Compare commits


66 Commits

Author SHA1 Message Date
Berkay Karaman
d85a42c7ad fix(install): guard writeSettings against null settingsPath for cline runtime (#2035)
* fix(install): guard writeSettings against null settingsPath for cline runtime

Cline returns settingsPath: null from install() because it uses .clinerules
instead of settings.json. The finishInstall() guard was missing !isCline,
causing a crash with ERR_INVALID_ARG_TYPE when installing with the cline runtime.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>

* test(cline): add regression tests for ERR_INVALID_ARG_TYPE null settingsPath guard

Adds two regression tests to tests/cline-install.test.cjs for gsd-build/get-shit-done#2044:
- Assert install(false, 'cline') does not throw ERR_INVALID_ARG_TYPE
- Assert settings.json is not written for cline runtime

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>

* test(cline): fix regression tests to directly call finishInstall with null settingsPath

The previous regression tests called install() which returns early for cline
before reaching finishInstall(), so the crash was never exercised. Fix by:
- Exporting finishInstall from bin/install.js
- Calling finishInstall(null, null, ..., 'cline') directly so the null
  settingsPath guard is actually tested

Tests now fail (ERR_INVALID_ARG_TYPE) without the fix and pass with it.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 13:58:16 -04:00
Tom Boucher
50537e5f67 fix(install): extend buildHookCommand to .sh hooks — absolute quoted paths (#2049)
* fix(autonomous): add Agent to allowed-tools in gsd-autonomous skill

Closes #2043

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix(install): extend buildHookCommand to .sh hooks — absolute quoted paths

- Extend buildHookCommand() to branch on .sh suffix, using 'bash' runner
  instead of 'node', so all hook paths go through the same quoted-path
  construction: bash "/absolute/path/hooks/gsd-*.sh"
- Replace three manual 'bash ' + targetDir + '...' concatenations for
  gsd-validate-commit.sh, gsd-session-state.sh, gsd-phase-boundary.sh
  with buildHookCommand(targetDir, hookName) for the global-install branch
- Global .sh hook paths are now double-quoted, fixing invocation failure
  when the config dir path contains spaces (Windows usernames, #2045)
- Adds regression tests in tests/sh-hook-paths.test.cjs

Closes #2045
Closes #2046

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 13:55:27 -04:00
Tom Boucher
6960fd28fe fix(autonomous): add Agent to allowed-tools in gsd-autonomous skill (#2048)
Closes #2043

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 13:55:13 -04:00
Tom Boucher
fd3a808b7e fix(workflow): offer recommendation instead of hard redirect for missing UI-SPEC.md (#2039)
* fix(workflow): offer recommendation instead of hard redirect when UI-SPEC.md missing

When plan-phase detects frontend indicators but no UI-SPEC.md, replace the
AskUserQuestion hard-exit block with an offer_next-style recommendation that
displays /gsd-ui-phase as the primary next step and /gsd-plan-phase --skip-ui
as the bypass option. Also registers --skip-ui as a parsed flag so it silently
bypasses the UI gate.

Closes #2011

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* ci: retrigger CI — resolve stale macOS check

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:41:59 -04:00
Tom Boucher
47badff2ee fix(workflow): add plain-text fallback for AskUserQuestion on non-Claude runtimes (#2042)
AskUserQuestion is a Claude Code-only tool. When running GSD on OpenAI Codex,
Gemini CLI, or other non-Claude runtimes, the model renders the tool call as a
markdown code block instead of executing it, so the interactive TUI never
appears and the session stalls without collecting user input.

The workflow.text_mode / --text flag mechanism already handles this in 5 of
the 37 affected workflows. This commit adds the same TEXT_MODE fallback
instruction to all remaining 32 workflows so that, when text_mode is enabled,
every AskUserQuestion call is replaced with a plain-text numbered list that
any runtime can handle.

Fixes #2012

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:30:46 -04:00
Tom Boucher
c8ab20b0a6 fix(workflow): use XcodeGen for iOS app scaffold — prevent SPM executable instead of .xcodeproj (#2041)
Adds ios-scaffold.md reference that explicitly prohibits Package.swift +
.executableTarget for iOS apps (produces macOS CLI, not iOS app bundle),
requires project.yml + xcodegen generate to create a proper .xcodeproj,
and documents SwiftUI API availability tiers (iOS 16 vs 17). Adds iOS
anti-patterns 28-29 to universal-anti-patterns.md and wires the reference
into gsd-executor.md so executors see the guidance during iOS plan execution.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:30:24 -04:00
Tom Boucher
083b26550b fix(worktree): executor deletion verification and pre-merge deletion block (#2040)
* fix(worktree): use reset --hard in worktree_branch_check to correctly set base (#2015)

The worktree_branch_check in execute-phase.md and quick.md used
git reset --soft as the fallback when EnterWorktree created a branch
from main/master instead of the current feature branch HEAD. --soft
moves the HEAD pointer but leaves working tree files from main unchanged,
so the executor worked against stale code and produced commits containing
the entire feature branch diff as deletions.

Fix: replace git reset --soft with git reset --hard in both workflow files.
--hard resets both the HEAD pointer and the working tree to the expected
base commit. It is safe in a fresh worktree that has no user changes.

Adds 4 regression tests (2 per workflow) verifying that the check uses
--hard and does not contain --soft.

* fix(worktree): executor deletion verification and pre-merge deletion block (#1977)

- Remove Windows-only qualifier from worktree_branch_check in execute-plan.md
  (the EnterWorktree base-branch bug affects all platforms, not just Windows)
- Add post-commit --diff-filter=D deletion check to gsd-executor.md task_commit_protocol
  so unexpected file deletions are flagged immediately after each task commit
- Add pre-merge --diff-filter=D deletion guard to execute-phase.md worktree cleanup
  so worktree branches containing file deletions are blocked before fast-forward merge
- Add regression test tests/worktree-safety.test.cjs covering all three behaviors

Fixes #1977

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:30:08 -04:00
Tom Boucher
fc4fcab676 fix(workflow): add gated hunk verification table to reapply-patches — structural enforcement of post-merge checks (#2037)
Adds a mandatory Hunk Verification Table output to Step 4 (columns: file,
hunk_id, signature_line, line_count, verified) and a new Step 5 gate that
STOPs with an actionable error if any row shows verified: no or the table
is absent. Prevents the LLM from silently bypassing post-merge checks by
making the next step structurally dependent on the table's presence and
content. Adds four regression tests covering table presence, column
requirements, Step 5 reference, and the gate condition.

Fixes #1999

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:29:25 -04:00
Tom Boucher
0b7dab7394 fix(workflow): auto-transition phase to complete when verify-work UAT passes with 0 issues (#2036)
After complete_session in verify-work.md, when final_status==complete and
issues==0, the workflow now executes transition.md inline (mirroring the
execute-phase pattern) to mark the phase complete in ROADMAP.md and STATE.md.
Security gate still gates the transition: if enforcement is enabled and no
SECURITY.md exists, the workflow suggests /gsd-secure-phase instead.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:29:09 -04:00
Tom Boucher
17bb9f8a25 fix(worktree): use reset --hard in worktree_branch_check to correctly set base (#2015) (#2028)
The worktree_branch_check in execute-phase.md and quick.md used
git reset --soft as the fallback when EnterWorktree created a branch
from main/master instead of the current feature branch HEAD. --soft
moves the HEAD pointer but leaves working tree files from main unchanged,
so the executor worked against stale code and produced commits containing
the entire feature branch diff as deletions.

Fix: replace git reset --soft with git reset --hard in both workflow files.
--hard resets both the HEAD pointer and the working tree to the expected
base commit. It is safe in a fresh worktree that has no user changes.

Adds 4 regression tests (2 per workflow) verifying that the check uses
--hard and does not contain --soft.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:07:13 -04:00
Tom Boucher
7f11362952 fix(phase): scan .planning/phases/ for orphan dirs in phase add (#2034)
cmdPhaseAdd computed maxPhase from ROADMAP.md only, allowing orphan
directories on disk (untracked in roadmap) to silently collide with
newly added phases. The new phase's mkdirSync succeeded against the
existing directory, contaminating it with fresh content.

Fix: take max(roadmapMax, diskMax) where diskMax scans
.planning/phases/ and strips optional project_code prefix before
parsing the leading integer. Backlog orphans (>=999) are skipped.

Adds 3 regression tests covering:
- orphan dir with number higher than roadmap max
- prefixed orphan dirs (project_code-NN-slug)
- no collision when orphan number is lower than roadmap max

Fixes #2026

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 12:04:33 -04:00
Tom Boucher
aa3e9cfaf4 feat(install): add Cline as a first-class runtime (#1991) (#2032)
Cline was documented as a supported runtime but was absent from
bin/install.js. This adds full Cline support:

- Registers --cline CLI flag and adds 'cline' to --all list
- Adds getDirName/getConfigDirFromHome/getGlobalDir entries (CLINE_CONFIG_DIR env var respected)
- Adds convertClaudeToClineMarkdown() and convertClaudeAgentToClineAgent()
- Wires Cline into copyWithPathReplacement(), install(), writeManifest(), finishInstall()
- Local install writes to project root (like Claude Code), not .cline/ subdirectory
- Generates .clinerules at install root with GSD integration rules
- Installs get-shit-done engine and agents with path/brand replacement
- Adds Cline as option 4 in interactive menu (13-runtime menu, All = 14)
- Updates banner description to include Cline
- Exports convertClaudeToClineMarkdown and convertClaudeAgentToClineAgent for testing
- Adds tests/cline-install.test.cjs with 17 regression tests
- Updates multi-runtime-select, copilot-install, kilo-install tests for new option numbers

Fixes #1991

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 11:47:22 -04:00
Tom Boucher
14c3ef5b1f fix(workflow): preserve structural planning commits in gsd-pr-branch (#2031)
The previous implementation filtered ALL .planning/-only commits, including
milestone archive commits, STATE.md, ROADMAP.md, and PROJECT.md updates.
Merging the PR branch then left the target with inconsistent planning state.

Fixes by distinguishing two categories of .planning/ commits:
- Structural (STATE.md, ROADMAP.md, MILESTONES.md, PROJECT.md,
  REQUIREMENTS.md, milestones/**): INCLUDED in PR branch
- Transient (phases/, quick/, research/, threads/, todos/, debug/,
  seeds/, codebase/, ui-reviews/): EXCLUDED from PR branch

The git rm in create_pr_branch is now scoped to transient subdirectories
only, so structural files survive cherry-pick into the PR branch.

Adds regression test asserting structural file handling is documented.

Closes #2004

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 11:25:55 -04:00
Tom Boucher
0a4ae79b7b fix(workflow): route offer_next based on CONTEXT.md existence for next phase (#2030)
When a phase completes, the offer_next step now checks whether CONTEXT.md
already exists for the next phase before presenting options.

- If CONTEXT.md is absent: /gsd-discuss-phase is the recommended first step
- If CONTEXT.md exists: /gsd-plan-phase is the recommended first step

Adds regression test asserting conditional routing is present.

Closes #2002

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 11:19:32 -04:00
Tom Boucher
d858f51a68 fix(phase): update plan count when current milestone is inside <details> (#2005) (#2029)
replaceInCurrentMilestone() locates content by finding the last </details>
in the ROADMAP and only operates on text after that boundary. When the
current (in-progress) milestone section is itself wrapped in a <details>
block (the standard /gsd-new-project layout), the phase section's
**Plans:** counter lives INSIDE that block. The replacement target ends up
in the empty space after the block's closing </details>, so the regex never
matches and the plan count stays at 0/N permanently.

Fix: switch the plan count update to use direct .replace() on the full
roadmapContent, consistent with the checkbox and progress table updates
that already use this pattern. The phase-scoped heading regex
(### Phase N: ...) is specific enough to avoid matching archived phases.
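The boundary bug can be demonstrated with a simplified model of the helper (the real `replaceInCurrentMilestone` in the roadmap code is more involved; this sketch only captures the last-`</details>` scoping described above):

```javascript
// Sketch: a helper that only edits text AFTER the last </details> cannot see
// a counter that lives INSIDE a <details>-wrapped current milestone, while a
// direct .replace() on the full content can.
function replaceAfterLastDetails(content, pattern, replacement) {
  const boundary = content.lastIndexOf('</details>');
  if (boundary === -1) return content.replace(pattern, replacement);
  const cut = boundary + '</details>'.length;
  return content.slice(0, cut) + content.slice(cut).replace(pattern, replacement);
}

const roadmap = [
  '<details>',
  '### Phase 3: Auth',
  '**Plans:** 0/2',
  '</details>',
].join('\n');

// Scoped helper: the counter is inside the block, so nothing changes.
const scoped = replaceAfterLastDetails(roadmap, /\*\*Plans:\*\* \d+\/2/, '**Plans:** 1/2');
// Direct replace on the full content: the counter updates.
const direct = roadmap.replace(/\*\*Plans:\*\* \d+\/2/, '**Plans:** 1/2');
```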

Adds two regression tests covering: (1) plan count updates inside a
<details>-wrapped current milestone, and (2) phase 2 plan count is not
corrupted when completing phase 1.
2026-04-10 11:15:59 -04:00
Tom Boucher
14b8add69e fix(verify): suppress W006 for phases with unchecked ROADMAP summary checkbox (#2009) (#2027)
W006 (Phase in ROADMAP.md but no directory on disk) fired for every phase
listed in ROADMAP.md that lacked a phase directory, including future phases
that haven't been started yet. This produced false DEGRADED health status
on any project with more than one phase planned.

Fix: before emitting W006, check the ROADMAP summary list for a
'- [ ] **Phase N:**' unchecked checkbox. Phases explicitly marked as not
yet started are intentionally absent from disk -- skip W006 for them.
Phases with a checked checkbox ([x]) or with no summary entry still
trigger W006 as before.
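The suppression check can be sketched as a one-regex gate, assuming the summary format quoted above:

```javascript
// Sketch: before emitting W006 for a phase with no directory on disk, look
// for an unchecked '- [ ] **Phase N:**' summary entry. An unchecked box means
// the phase is intentionally not started, so the warning is suppressed.
function shouldEmitW006(roadmapContent, phaseNum) {
  const unchecked = new RegExp(`^- \\[ \\] \\*\\*Phase ${phaseNum}:`, 'm');
  return !unchecked.test(roadmapContent);
}
```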

Adds two regression tests: one verifying W006 is suppressed for unchecked
phases, and one verifying W006 still fires for checked phases with no disk
directory.
2026-04-10 11:03:10 -04:00
Tom Boucher
0f77681df4 fix(commit): skip staging deletions for missing files when --files is explicit (#2014) (#2025)
When gsd-tools commit is invoked with --files and one of the listed files
does not exist on disk, the previous code called git rm --cached which
staged and committed a deletion. This silently removed tracked planning
files (STATE.md, ROADMAP.md) from the repository whenever they were
temporarily absent on disk.

Fix: when explicit --files are provided, skip files that do not exist
rather than staging their deletion. Only the default (.planning/ staging
path) retains the git rm --cached behavior so genuinely removed planning
files are not left dangling in the index.

Adds regression tests verifying that missing files in an explicit --files
list are never staged as deletions.
2026-04-10 10:56:09 -04:00
Tibsfox
21d2bd039d fix(hooks): skip read-guard advisory on Claude Code runtime (#2001)
* fix(hooks): skip read-guard advisory on Claude Code runtime (#1984)

Claude Code natively enforces read-before-edit at the runtime level,
so the gsd-read-guard.js advisory is redundant — it wastes ~80 tokens
per Write/Edit call and clutters tool flow with system-reminder noise.

Add early exit when CLAUDE_SESSION_ID is set (standard Claude Code
session env var). Non-Claude runtimes (OpenCode, Gemini, etc.) that
lack native read-before-edit enforcement continue to receive the
advisory as before.
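The early exit amounts to a single environment check; a minimal sketch (the real gsd-read-guard.js hook does more — this only shows the gate):

```javascript
// Sketch: Claude Code sets CLAUDE_SESSION_ID and already enforces
// read-before-edit natively, so the advisory would be redundant there.
function shouldRunReadGuard(env) {
  if (env.CLAUDE_SESSION_ID) return false;
  return true; // non-Claude runtimes still get the advisory
}
```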

Closes #1984

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(hooks): sanitize runHook env to prevent test failures in Claude Code

The runHook() test helper now blanks CLAUDE_SESSION_ID so positive-path
tests pass even when the test suite runs inside a Claude Code session.
The new skip test passes the env var explicitly via envOverrides.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 10:50:35 -04:00
Tibsfox
04e9bd5e76 fix(phase): update overview bullet checkbox on phase complete (#1998) (#2000)
cmdPhaseComplete used replaceInCurrentMilestone() to update the overview
bullet checkbox (- [ ] → - [x]), but that function scopes replacements
to content after the last </details> tag. The current milestone's
overview bullets appear before any <details> blocks, so the replacement
never matched.

Switch to direct .replace() which correctly finds and updates the first
matching unchecked checkbox. This is safe because unchecked checkboxes
([ ]) only exist in the current milestone — archived phases have [x].

Closes #1998

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 10:50:17 -04:00
Lakshman Turlapati
d0ab1d8aaa fix(codex): convert /gsd- workflow commands to $gsd- during installation (#1994)
The convertSlashCommandsToCodexSkillMentions function only converted
colon-style skill invocations (/gsd:command) but not hyphen-style
command references (/gsd-command) used in workflow output templates
(Next Up blocks, phase completion messages, etc.). This caused Codex
users to see /gsd- prefixed commands instead of $gsd- in chat output.

- Add regex to convert /gsd-command → $gsd-command with negative
  lookbehind to exclude file paths (e.g. bin/gsd-tools.cjs)
- Strip /clear references in Codex output (no Codex equivalent)
- Add 5 regression tests covering command conversion, path
  preservation, and /clear removal
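The conversion regex can be sketched like this, assuming the behavior described above (the exact pattern in the installer may differ):

```javascript
// Sketch: rewrite /gsd-command to $gsd-command, but use a negative lookbehind
// so file paths like bin/gsd-tools.cjs — where the slash is preceded by a
// path character — are left untouched.
function convertGsdCommands(text) {
  return text.replace(/(?<![\w.-])\/gsd-([a-z-]+)/g, '$$gsd-$1');
}
```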

Co-authored-by: Lakshman <lakshman@lakshman-GG9LQ90J61.local>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 10:49:58 -04:00
Ned Malki
f8526b5c01 fix: complete planningDir migration for config CRUD, template fill, and verify (#1986)
* fix(config): route CRUD through planningDir to honor GSD_PROJECT

PR #1484 added planningDir(cwd) and the GSD_PROJECT env var so a workspace
can host multiple projects under .planning/{project}/. loadConfig() in
core.cjs (line 256) was migrated at the time, but the four CRUD entry points
in config.cjs and the planningPaths() helper in core.cjs were left resolving
against planningRoot(cwd).

The result was a silent split-brain in any multi-project workspace:

  - cmdConfigGet, setConfigValue, ensureConfigFile, cmdConfigNewProject
    all wrote to and read from .planning/config.json
  - loadConfig read from .planning/{GSD_PROJECT}/config.json

So `gsd-tools config-get workflow.discuss_mode` returned "unset" even when
the value was correctly stored in the project-routed file, because the
reader and writer pointed at different paths.

planningPaths() carried a comment that "Shared paths (project, config)
always resolve to the root .planning/" which described the original intent,
but loadConfig() already contradicted that intent for config.json. project
and config now both resolve through planningDir() so the contract matches
the only function that successfully read config.json in the multi-project
case.

Single-project users (no GSD_PROJECT set) are unaffected: planningRoot()
and planningDir() return the same path when no project is configured.

Verification: in a workspace with .planning/projectA/config.json and
GSD_PROJECT=projectA, `gsd-tools config-get workflow.discuss_mode` now
returns the value instead of "Error: Key not found". Backward compat
verified by running the same command without GSD_PROJECT in a
single-project layout.

Affected sites:
- get-shit-done/bin/lib/config.cjs cmdConfigNewProject (line 199)
- get-shit-done/bin/lib/config.cjs ensureConfigFile (line 244)
- get-shit-done/bin/lib/config.cjs setConfigValue (line 294)
- get-shit-done/bin/lib/config.cjs cmdConfigGet (line 367)
- get-shit-done/bin/lib/core.cjs planningPaths.config (line 706)
- get-shit-done/bin/lib/core.cjs planningPaths.project (line 705)

* fix(template): emit project-aware references in template fill plan

The template fill plan body hardcoded `@.planning/PROJECT.md`,
`@.planning/ROADMAP.md`, and `@.planning/STATE.md` references. In a
multi-project workspace these resolve to nothing because the actual
project, roadmap, and state files live under .planning/{GSD_PROJECT}/.

`gsd-tools verify references` reports them as missing on every PLAN.md
generated by template fill in any GSD_PROJECT-routed workspace.

Fix: route the references through planningDir(cwd), normalize via the
existing toPosixPath helper for cross-platform path consistency, and
embed them as `@<relative-path>` matching the phase-relative reference
pattern used elsewhere in the file.

Single-project users (no GSD_PROJECT set) get exactly the same output
as before because planningDir() falls back to .planning/ when no project
is active.

Affected site: get-shit-done/bin/lib/template.cjs cmdTemplateFill plan
branch (lines 142-145, the @.planning/ refs in the Context section).

* fix(verify): planningDir for cmdValidateHealth and regenerateState

cmdValidateHealth resolved projectPath and configPath via planningRoot(cwd)
while ROADMAP/STATE/phases/requirements went through planningDir(cwd). The
inconsistency reported "missing PROJECT.md" and "missing config.json" in
multi-project layouts even when the project-routed copies existed and the
config CRUD writers (now also routed by the previous commit in this PR)
were writing to them.

regenerateState (the /gsd:health --repair STATE.md regeneration path)
hardcoded `See: .planning/PROJECT.md` in the generated body, which fails
the same reference check it just regenerated for in any GSD_PROJECT-routed
workspace.

Fix: route both sites through planningDir(cwd). For regenerateState, derive
a POSIX-style relative reference from the resolved path so the reference
matches verify references' resolution rules. Also dropped the planningRoot
import from verify.cjs since it is no longer used after this change.

Single-project users (no GSD_PROJECT set) get the same paths as before:
planningDir() falls back to .planning/ when no project is configured.

Affected sites:
- get-shit-done/bin/lib/verify.cjs cmdValidateHealth (lines 536-541)
- get-shit-done/bin/lib/verify.cjs regenerateState repair (line 865)
- get-shit-done/bin/lib/verify.cjs core.cjs import (line 8, dropped unused
  planningRoot)
2026-04-10 10:49:42 -04:00
Tibsfox
adec4eef48 fix(worktree): use hard reset to correct file tree when branch base is wrong (#1982)
* fix(worktree): use hard reset to correct file tree when branch base is wrong (#1981)

The worktree_branch_check mitigation detects when EnterWorktree creates
branches from main instead of the current feature branch, but used
git reset --soft to correct it. This only fixed the commit pointer —
the working tree still contained main's files, causing silent data loss
on merge-back when the agent's commits overwrote feature branch code.

Changed to git reset --hard which safely corrects both pointer and file
tree (the check runs before any agent work, so no changes to lose).
Also removed the broken rebase --onto attempt in execute-phase.md that
could replay main's commits onto the feature branch, and added post-reset
verification that aborts if the correction fails.

Updated documentation from "Windows" to "all platforms" since the
upstream EnterWorktree bug affects macOS, Linux, and Windows alike.

Closes #1981

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(worktree): update settings.md worktree description to say cross-platform

Aligns with the workflow file updates — the EnterWorktree base-branch
bug affects all platforms, not just Windows.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 10:49:20 -04:00
Fana
33575ba91d feat: /gsd-ai-integration-phase + /gsd-eval-review — AI framework selection and eval coverage layer (#1971)
* feat: /gsd:ai-phase + /gsd:eval-review — AI evals and framework selection layer

Adds a structured AI development layer to GSD with 5 new agents, 2 new
commands, 2 new workflows, 2 reference files, and 1 template.

Commands:
- /gsd:ai-phase [N] — pre-planning AI design contract (inserts between
  discuss-phase and plan-phase). Orchestrates 4 agents in sequence:
  framework-selector → ai-researcher → domain-researcher → eval-planner.
  Output: AI-SPEC.md with framework decision, implementation guidance,
  domain expert context, and evaluation strategy.
- /gsd:eval-review [N] — retroactive eval coverage audit. Scores each
  planned eval dimension as COVERED/PARTIAL/MISSING. Output: EVAL-REVIEW.md
  with 0-100 score, verdict, and remediation plan.

Agents:
- gsd-framework-selector: interactive decision matrix (6 questions) →
  scored framework recommendation for CrewAI, LlamaIndex, LangChain,
  LangGraph, OpenAI Agents SDK, Claude Agent SDK, AutoGen/AG2, Haystack
- gsd-ai-researcher: fetches official framework docs + writes AI systems
  best practices (Pydantic structured outputs, async-first, prompt
  discipline, context window management, cost/latency budget)
- gsd-domain-researcher: researches business domain and use-case context —
  surfaces domain expert evaluation criteria, industry failure modes,
  regulatory constraints, and practitioner rubric ingredients before
  eval-planner writes measurable criteria
- gsd-eval-planner: designs evaluation strategy grounded in domain context;
  defaults to Arize Phoenix (tracing) + RAGAS (RAG eval) with detect-first
  guard for existing tooling
- gsd-eval-auditor: retroactive codebase scan → scores eval coverage

Integration points:
- plan-phase: non-blocking nudge (step 4.5) when AI keywords detected and
  no AI-SPEC.md present
- settings: new workflow.ai_phase toggle (default on)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: refine ai-integration-phase layer — rename, house style, consistency fixes

Amends the ai-evals framework layer (df8cb6c) with post-review improvements
before opening upstream PR.

Rename /gsd:ai-phase → /gsd:ai-integration-phase:
- Renamed commands/gsd/ai-phase.md → ai-integration-phase.md
- Renamed get-shit-done/workflows/ai-phase.md → ai-integration-phase.md
- Updated config key: workflow.ai_phase → workflow.ai_integration_phase
- Updated repair action: addAiPhaseKey → addAiIntegrationPhaseKey
- Updated all 84 cross-references across agents, workflows, templates, tests

Consistency fixes (same class as PR #1380 review):
- commands/gsd: objective described 3-agent chain, missing gsd-domain-researcher
- workflows/ai-integration-phase: purpose tag described 3-agent chain + "locks
  three things" — updated to 4 agents + 4 outputs
- workflows/ai-integration-phase: missing DOMAIN_MODEL resolve-model call in
  step 1 (domain-researcher was spawned in step 7.5 with no model variable)
- workflows/ai-integration-phase: fractional step ## 7.5 renumbered to integers
  (steps 8–12 shifted)

Agent house style (GSD meta-prompting conformance):
- All 5 new agents refactored to execution_flow + step name="" structure
- Role blocks compressed to 2 lines (removed verbose "Core responsibilities")
- Added skills: frontmatter to all 5 agents (agent-frontmatter tests)
- Added # hooks: commented pattern to file-writing agents
- Added ALWAYS use Write tool anti-heredoc instruction to file-writing agents
- Line reductions: ai-researcher −41%, domain-researcher −25%, eval-planner −26%,
  eval-auditor −25%, framework-selector −9%

Test coverage (tests/ai-evals.test.cjs — 48 tests):
- CONFIG: workflow.ai_integration_phase defaults and config-set/get
- HEALTH: W010 warning emission and addAiIntegrationPhaseKey repair
- TEMPLATE: AI-SPEC.md section completeness (10 sections)
- COMMAND: ai-integration-phase + eval-review frontmatter validity
- AGENTS: all 5 new agent files exist
- REFERENCES: ai-evals.md + ai-frameworks.md exist and are non-empty
- WORKFLOW: plan-phase nudge integration, workflow files exist + agent coverage

603/603 tests passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* feat: add Google ADK to framework selector and reference matrix

Google ADK (released March 2025) was missing from the framework options.
Adds Python + Java multi-agent framework optimised for Gemini / Vertex AI.

- get-shit-done/references/ai-frameworks.md: add Google ADK profile (type,
  language, model support, best for, avoid if, strengths, weaknesses, eval
  concerns); update Quick Picks, By System Type, and By Model Commitment tables
- agents/gsd-framework-selector.md: add "Google (Gemini)" to model provider
  interview question
- agents/gsd-ai-researcher.md: add Google ADK docs URL to documentation_sources

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: adapt to upstream conventions post-rebase

- Remove skills: frontmatter from all 5 new agents (upstream changed
  convention — skills: breaks Gemini CLI and must not be present)
- Add workflow.ai_integration_phase to VALID_CONFIG_KEYS whitelist in
  config.cjs (config-set blocked unknown keys)
- Add ai_integration_phase: true to CONFIG_DEFAULTS in core.cjs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: rephrase 4b.1 line to avoid false-positive in prompt-injection scan

"contract as a Pydantic model" matched the `act as a` pattern case-insensitively.
Rephrased to "output schema using a Pydantic model".

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: adapt to upstream conventions (W016, colon refs, config docs)

- Replace verify.cjs from upstream to restore W010-W015 + cmdValidateAgents,
  lost when rebase conflict was resolved with --theirs
- Add W016 (workflow.ai_integration_phase absent) inside the config try block,
  avoids collision with upstream's W010 agent-installation check
- Add addAiIntegrationPhaseKey repair case mirroring addNyquistKey pattern
- Replace /gsd: colon format with /gsd- hyphen format across all new files
  (agents, workflows, templates, verify.cjs) per stale-colon-refs guard (#1748)
- Add workflow.ai_integration_phase to planning-config.md reference table
- Add ai_integration_phase → workflow.ai_integration_phase to NAMESPACE_MAP
  in config-field-docs.test.cjs so CONFIG_DEFAULTS coverage check passes
- Update ai-evals tests to use W016 instead of W010

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: add 5 new agents to E2E Copilot install expected list

gsd-ai-researcher, gsd-domain-researcher, gsd-eval-auditor,
gsd-eval-planner, gsd-framework-selector added to the hardcoded
expected agent list in copilot-install.test.cjs (#1890).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-10 10:49:00 -04:00
Tibsfox
bad9c63fcb ci: update action versions to v6 and extend CI to release/hotfix branches (#1955) (#1965)
- Update actions/checkout from v4.2.2 to v6.0.2 in release.yml and
  hotfix.yml (prevents breakage after June 2026 Node.js 20 deprecation)
- Update actions/setup-node from v4.1.0 to v6.3.0 in both workflows
- Add release/** and hotfix/** to test.yml push triggers
- Add release/** and hotfix/** to security-scan.yml PR triggers

test.yml already used v6 pins — this aligns the release pipelines.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 10:48:14 -04:00
Tibsfox
cb1eb7745a fix(core): preserve letter suffix case in normalizePhaseName (#1963)
* fix(core): preserve letter suffix case in normalizePhaseName (#1962)

normalizePhaseName uppercased letter suffixes (e.g., "16c" → "16C"),
causing directory/roadmap mismatches on case-sensitive filesystems.
init progress couldn't match directory "16C-name" to roadmap "16c".

Preserve original case — comparePhaseNum still uppercases for sorting
(correct), but normalizePhaseName is used for display and directory
creation where case must match the roadmap.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test(phase): update existing test to expect preserved letter case

The 'uppercases letters' test asserted the old behavior (3a → 03A).
With normalizePhaseName now preserving case, update expectations to
match (3a → 03a) and rename the test to 'preserves letter case'.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 10:48:00 -04:00
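The preserved-case behavior can be sketched as follows. This is an illustrative approximation, not the real GSD helper: the actual `normalizePhaseName` handles more input shapes, and `comparePhaseNum` still uppercases internally for sorting only.

```javascript
// Hypothetical sketch: zero-pad the numeric part, keep any letter suffix
// exactly as written ("3a" -> "03a", "16c" -> "16c", never "16C"),
// so directory names match ROADMAP.md on case-sensitive filesystems.
function normalizePhaseName(name) {
  const m = /^(\d+)([a-z]*)$/i.exec(String(name).trim());
  if (!m) return name; // not a bare phase number, pass through unchanged
  return m[1].padStart(2, '0') + m[2]; // no .toUpperCase() on the suffix
}
```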
Anshul Vishwakarma
49645b04aa fix(executor): enforce acceptance_criteria as hard gate, not advisory text (#1959)
The existing MANDATORY acceptance_criteria instruction is purely advisory —
executor agents read it and silently skip criteria when they run low on
context or hit complexity. This causes planned work to be dropped without
any signal to the orchestrator or verifier.

Changes:
- Replace advisory text with a structured 5-step verification loop
- Each criterion must be proven via grep/file-check/CLI command
- Agent is BLOCKED from next task until all criteria pass
- Failed criteria after 2 fix attempts logged as deviation (not silent skip)
- Self-check step now re-runs ALL acceptance criteria before SUMMARY
- Self-check also re-runs plan-level verification commands

Closes #1958

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-10 10:47:43 -04:00
storyandwine
50cce89a7c feat: support CodeBuddy runtime (#1887)
Add CodeBuddy (Tencent Cloud AI coding IDE/CLI) as a first-class
runtime in the GSD installer.

- Add --codebuddy CLI flag and interactive menu option
- Add directory mapping (.codebuddy/ local, ~/.codebuddy/ global)
- Add CODEBUDDY_CONFIG_DIR env var support
- Add markdown conversion (CLAUDE.md -> CODEBUDDY.md, .claude/ -> .codebuddy/)
- Preserve tool names (CodeBuddy uses same names as Claude Code)
- Configure settings.json hooks (Claude Code compatible hook spec)
- Add copyCommandsAsCodebuddySkills for SKILL.md format
- Add 15 tests (dir mapping, env vars, conversion, E2E install/uninstall)
- Update README.md and README.zh-CN.md
- Update existing tests for new runtime numbering

Co-authored-by: happyu <happyu@tencent.com>
2026-04-10 10:46:21 -04:00
chudeemeke
7e2217186a feat(review): add per-CLI model selection via config (#1859)
* feat(review): add per-CLI model selection via config

- Add review.models.<cli> dynamic config keys to VALID_CONFIG_KEYS
- Update review.md to read model preferences via config-get at runtime
- Null/missing values fall back to CLI defaults (backward compatible)
- Add key suggestion for common typo (review.model)
- Update planning-config reference doc

Closes #1849

* fix(review): handle absent and null model config gracefully

Address PR #1859 review feedback from @trek-e:

1. Add `|| true` to all four config-get subshell invocations in
   review.md so that an absent review.models.<cli> key does not
   produce a non-zero exit from the subshell. cmdConfigGet calls
   error() (process.exit(1)) when the key path is missing; the
   2>/dev/null suppresses the message but the exit code was still
   discarded silently. The || true makes the fall-through explicit
   and survives future set -e adoption.

2. Add `&& [ "$VAR" != "null" ]` to all four if guards. cmdConfigSet
   does not parse the literal 'null' as JSON null — it stores the
   string 'null' — and cmdConfigGet --raw returns the literal text
   'null' for that value. Without the extra guard the workflow would
   pass `-m "null"` to the CLI, which crashes. The issue spec
   documents null as the "fall back to CLI default" sentinel, so this
   restores the contract.

3. Add tests/review-model-config.test.cjs covering all five cases
   trek-e listed:
   - isValidConfigKey accepts review.models.gemini (via config-set)
   - isValidConfigKey accepts review.models.codex (via config-set)
   - review.model is rejected and suggests review.models.<cli-name>
   - config-set then config-get round-trip with a model ID
   - config-set then config-get round-trip with null (returns "null")

   Tests follow the node:test + node:assert/strict pattern from
   tests/agent-skills.test.cjs and use runGsdTools from helpers.cjs.

Closes #1849
2026-04-10 10:44:15 -04:00
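The two guards described in point 1 and point 2 combine into one shell pattern. The key name and variable names below are illustrative, not the exact review.md text:

```shell
# Hypothetical sketch of one of the four guards in review.md.
# `|| true` keeps an absent key from failing the subshell under `set -e`;
# the second test skips the literal string "null" sentinel so it is never
# passed as a model ID.
MODEL="$(node gsd-tools.cjs config-get review.models.gemini --raw 2>/dev/null || true)"
GEMINI_ARGS=""
if [ -n "$MODEL" ] && [ "$MODEL" != "null" ]; then
  GEMINI_ARGS="-m $MODEL"
fi
```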
yuiooo1102-droid
dcb503961a feat: harness engineering improvements — post-merge test gate, shared file isolation, behavioral verification (#1486)
* feat: harness engineering improvements — post-merge test gate, shared file isolation, behavioral verification

Three improvements inspired by Anthropic's harness engineering research
(March 2026) and real-world pain points from parallel worktree execution:

1. Post-merge test gate (execute-phase.md)
   - Run project test suite after merging each wave's worktrees
   - Catches cross-plan integration failures that individual Self-Checks miss
   - Addresses the Generator self-evaluation blind spot (agents praise own work)

2. Shared file isolation (execute-phase.md)
   - Executors no longer modify STATE.md or ROADMAP.md in parallel mode
   - Orchestrator updates tracking files centrally after merge
   - Eliminates the #1 source of merge conflicts in parallel execution

3. Behavioral verification (verify-phase.md)
   - Verifier runs project test suite and CLI commands, not just grep
   - Follows Anthropic's Generator/Evaluator separation principle
   - Tests actual behavior against success criteria, not just file existence

Real-world evidence: In a session executing 37 plans across 8 phases with
parallel worktrees, we observed:
- 4 test failures after merge that all Self-Checks missed (models.py type loss)
- STATE.md/ROADMAP.md conflicts on every single parallel merge
- Verifier reporting PASSED while merged code had broken imports

References:
- Anthropic Engineering Blog: Harness Design for Long-Running Apps (2026-03-24)
- Issue #1451: Massive git worktree problem
- Issue #1413: Autonomous execution without manual context clearing

* fix: address review feedback — test runner detection, parallel isolation, edge cases

- Replace hardcoded jest/vitest with `npm test` (reads project's scripts.test)
- Add Go detection to post-merge test gate (was only in verify-phase)
- Add 5-minute timeout to post-merge test gate to prevent pipeline stalls
- Track cumulative wave failures via WAVE_FAILURE_COUNT for cross-wave awareness
- Guard orchestrator tracking commit against unchanged files (prevent empty commits)
- Align execute-plan.md with parallel isolation model (skip STATE.md/ROADMAP.md
  updates when running in parallel mode, orchestrator handles centrally)
- Scope behavioral verification CLI checks: skip when no fixtures/test data exist,
  mark as NEEDS HUMAN instead of inventing inputs

* fix: pass PARALLEL_MODE to executor agents to activate shared file isolation

The executor spawn prompt in execute-phase.md instructed agents not to
modify STATE.md/ROADMAP.md, but execute-plan.md gates this behavior on
PARALLEL_MODE which was never defined in the executor context. This adds
the variable to the spawn prompt and wraps all three shared-file steps
(update_current_position, update_roadmap, git_commit_metadata) with
explicit conditional guards.

* fix: replace unreliable PARALLEL_MODE env var with git worktree auto-detection

Address PR #1486 review feedback (trek-e):

1. PARALLEL_MODE was never reliably set — the <env> block instructed the LLM
   to export a bash variable, but each Bash tool call runs in a fresh shell
   so the variable never persisted. Replace with self-contained worktree
   detection: `[ -f .git ]` returns true in worktrees (.git is a file) and
   false in main repos (.git is a directory). Each bash block detects
   independently with no external state dependency.

2. TEST_EXIT only checked for timeout (124) — test failures (non-zero,
   non-124) were silently ignored, making the "If tests fail" prose
   unreachable. Add full if/elif/else handling: 0=pass, 124=timeout,
   else=fail with WAVE_FAILURE_COUNT increment.

3. Add Go detection to regression_gate (was missing go.mod check).
   Replace hardcoded npx jest/vitest with npm test for consistency.

4. Renumber steps from 4/4b/4c/5/5/5b to 4a/4b/4c/4d/5/6/7/8/9.

* fix: address remaining review blockers — timeout, tracking guard, shell safety

- verify-phase.md: wrap behavioral_verification test suite in timeout 300
- execute-phase.md: gate tracking update on TEST_EXIT=0, skip on failure/timeout
- Quote all TEST_EXIT variables, add default initialization
- Add else branch for unrecognized project types
- Renumber steps to align with upstream (5.x series)

* fix: rephrase worktree success_criteria to satisfy substring test guard

The worktree mode success_criteria line literally contained "STATE.md"
and "ROADMAP.md" inside a prohibition ("No modifications to..."), but
the test guard in execute-phase-worktree-artifacts.test.cjs uses a
substring check and cannot distinguish prohibition from requirement.

Rephrase to "shared orchestrator artifacts" so the substring check
passes while preserving the same intent.
2026-04-10 10:42:45 -04:00
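The worktree auto-detection in the final revisions rests on a git implementation detail: in a linked worktree, `.git` is a plain file containing a `gitdir:` pointer, while in the main checkout it is a directory. A minimal sketch (variable name as used in the commits above):

```shell
# In a linked worktree, .git is a file; in the main repo it is a directory.
# Each bash block re-detects independently, so no exported state is needed.
if [ -f .git ]; then
  PARALLEL_MODE=1   # inside a worktree: skip shared-file updates
else
  PARALLEL_MODE=0   # main checkout: safe to update tracking files
fi
```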
Tibsfox
295a5726dc fix(ui-phase): suggest discuss-phase when CONTEXT.md is missing (#1952) (#1964)
The Next Up block always suggested /gsd-plan-phase, but plan-phase
redirects to discuss-phase when CONTEXT.md doesn't exist. This caused
a confusing two-step redirect ~90% of the time since ui-phase doesn't
create CONTEXT.md.

Conditionally suggest discuss-phase or plan-phase based on CONTEXT.md
existence, matching the logic in progress.md Route B.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-08 14:02:26 -04:00
Tom Boucher
f7549d437e fix(core): resolve @file: references in gsd-tools stdout (#1891) (#1949)
Workflows used bash-specific `if [[ "$INIT" == @file:* ]]` to detect
when large JSON was written to a temp file. This syntax breaks on
PowerShell and other non-bash shells.

Intercept stdout in gsd-tools.cjs to transparently resolve @file:
references before they reach the caller, matching the existing --pick
path behavior. The bash checks in workflow files become harmless
no-ops and can be removed over time.

Co-authored-by: Tibsfox <tibsfox@tibsfox.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:40:54 -04:00
Tom Boucher
e6d2dc3be6 fix(phase): skip 999.x backlog phases in phase-add numbering (#1950)
Backlog phases use 999.x numbering and should not be counted when
calculating the next sequential phase ID. Without this fix, having
backlog phases causes the next phase to be numbered 1000+.

Co-authored-by: gg <grgbrasil@gmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:40:47 -04:00
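The numbering rule can be sketched with a hypothetical helper (the real logic lives in the phase-add command and works on parsed phase IDs):

```javascript
// Hypothetical sketch: backlog phases live at 999.x and must not drive
// the next sequential phase number, or the next phase becomes 1000+.
function nextPhaseNumber(phaseNums) {
  const real = phaseNums.filter((n) => n < 999); // drop 999.x backlog entries
  return real.length ? Math.floor(Math.max(...real)) + 1 : 1;
}
```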
Tom Boucher
4dd35f6b69 fix(state): correct TOCTOU races, busy-wait, lock cleanup, and config locking (#1944)
Migrate cmdStateUpdateProgress, cmdStateAddDecision, cmdStateAddBlocker,
cmdStateResolveBlocker, cmdStateRecordSession, and cmdStateBeginPhase from
bare readFileSync+writeStateMd to readModifyWriteStateMd, eliminating the
TOCTOU window where two concurrent callers read the same content and the
second write clobbers the first.

Replace the busy-wait polling loop with Atomics.wait(), matching the pattern
already used in withPlanningLock in core.cjs.

Track lock files created by state.cjs and core.cjs and register a
process.on('exit') handler to unlink them on process exit. The exit event
fires even when process.exit(1) is called inside a locked region,
eliminating stale lock files after errors.

Wrap the read-modify-write body of setConfigValue in a planning lock,
preventing concurrent config-set calls from losing each other's writes.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 17:39:29 -04:00
Tom Boucher
14fd090e47 docs(config): document missing config keys in planning-config.md (#1947)
* fix(core): resolve @file: references in gsd-tools stdout (#1891)

Workflows used bash-specific `if [[ "$INIT" == @file:* ]]` to detect
when large JSON was written to a temp file. This syntax breaks on
PowerShell and other non-bash shells.

Intercept stdout in gsd-tools.cjs to transparently resolve @file:
references before they reach the caller, matching the existing --pick
path behavior. The bash checks in workflow files become harmless
no-ops and can be removed over time.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* docs(config): add missing config fields to planning-config.md (#1880)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Tibsfox <tibsfox@tibsfox.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:36:47 -04:00
Tom Boucher
13faf66132 fix(installer): preserve USER-PROFILE.md and dev-preferences.md on re-install (#1945)
Running gsd-update (re-running the installer) silently deleted two
user-generated files:
- get-shit-done/USER-PROFILE.md (created by /gsd-profile-user)
- commands/gsd/dev-preferences.md (created by /gsd-profile-user)

Root causes:
1. copyWithPathReplacement() calls fs.rmSync(destDir, {recursive:true})
   before copying, wiping USER-PROFILE.md with no preserve allowlist.
2. The legacy commands/gsd/ cleanup at ~line 5211 rmSync'd the entire
   directory, wiping dev-preferences.md.
3. The backup path in profile-user.md pointed to the same directory
   that gets wiped, so the backup was also lost.

Fix:
- Add preserveUserArtifacts(destDir, fileNames) and restoreUserArtifacts()
  helpers that save/restore listed files around destructive wipes.
- Call them in install() before the get-shit-done/ copy (preserves
  USER-PROFILE.md) and before the legacy commands/gsd/ cleanup
  (preserves dev-preferences.md).
- Fix profile-user.md backup path from ~/.claude/get-shit-done/USER-PROFILE.backup.md
  to ~/.claude/USER-PROFILE.backup.md (outside the wiped directory).

Closes #1924

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 17:28:23 -04:00
Tom Boucher
60fa2936dd fix(core): add atomicWriteFileSync to prevent truncated files on kill (#1943)
Replaces direct fs.writeFileSync calls for STATE.md, ROADMAP.md, and
config.json with write-to-temp-then-rename so a process killed mid-write
cannot leave an unparseable truncated file. Falls back to direct write if
rename fails (e.g. cross-device). Adds regression tests for the helper.

Closes #1915

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 17:27:20 -04:00
Tom Boucher
f6a7b9f497 fix(milestone): prevent data loss and Backlog drop on milestone completion (#1940)
- Reorder reorganize_roadmap_and_delete_originals to commit archive files
  as a safety checkpoint BEFORE removing any originals (fixes #1913)
- Use overwrite-in-place for ROADMAP.md instead of delete-then-recreate
- Use git rm for REQUIREMENTS.md to stage deletion atomically with history
- Add 3-step Backlog preservation protocol: extract before rewrite, re-append
  after, skip silently if absent (fixes #1914)
- Update success_criteria and archival_behavior to reflect new ordering
2026-04-07 17:26:33 -04:00
Tibsfox
6d429da660 fix(milestone): replace test()+replace() with compare pattern to avoid global regex lastIndex bug (#1923)
The requirement marking function used test() then replace() on the
same global-flag regex. test() advances lastIndex, so replace() starts
from the wrong position and can miss the first match.

Replace with direct replace() + string comparison to detect changes.
Also drop unnecessary global flag from done-check patterns that only
need existence testing, and eliminate the duplicate regex construction
for the table pattern.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:26:31 -04:00
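The underlying hazard is that a regex with the `g` flag is stateful: `test()` advances `lastIndex`, so the next stateful operation on the same object resumes mid-string. The compare pattern sidesteps statefulness entirely (function name below is illustrative):

```javascript
// A /g regex is stateful: test() advances lastIndex.
const re = /done/g;
const line = 'done: task one';
re.test(line);   // true  — lastIndex is now 4
re.test(line);   // false — the second search resumes at index 4

// Compare pattern from the fix: one replace(), then detect change by
// string comparison instead of a separate stateful test().
function markRequirement(content, pattern, replacement) {
  const updated = content.replace(pattern, replacement);
  return { updated, changed: updated !== content };
}
```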
Tibsfox
8021e86038 fix(install): anchor local hook paths to $CLAUDE_PROJECT_DIR (#1906) (#1917)
Local installs wrote bare relative paths (e.g. `node .claude/hooks/...`)
into settings.json. Claude Code persists the shell's cwd between tool
calls, so a single `cd subdir` broke every hook for the rest of the
session.

Prefix all 9 local hook commands with "$CLAUDE_PROJECT_DIR"/ so path
resolution is always anchored to the project root regardless of cwd.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:26:29 -04:00
Tibsfox
7bc6668504 fix(phase): use readModifyWriteStateMd for atomic STATE.md updates in phase transitions (#1936)
cmdPhaseComplete and cmdPhasesRemove read STATE.md outside the lock
then wrote inside. A crash between the ROADMAP update (locked) and
the STATE write left them inconsistent. Wrap both STATE.md updates in
readModifyWriteStateMd to hold the lock across read-modify-write.

Also exports readModifyWriteStateMd from state.cjs for cross-module use.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:26:26 -04:00
Tibsfox
d12d31f8de perf(hooks): add .planning/ sentinel check before config read in context monitor (#1930)
The context monitor hook read and parsed config.json on every
PostToolUse event. For non-GSD projects (no .planning/ directory),
this was unnecessary I/O. Add a quick existsSync check for the
.planning/ directory before attempting to read config.json.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:25:21 -04:00
Tibsfox
602b34afb7 feat(config): add --default flag to config-get for graceful absent-key handling (#1893) (#1920)
When --default <value> is passed, config-get returns the default value
(exit 0) instead of erroring (exit 1) when the key is absent or
config.json doesn't exist. When the key IS present, --default is
ignored and the real value returned.

This lets workflows express optional config reads without defensive
`2>/dev/null || true` boilerplate that obscures intent and is fragile
under `set -e`.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 17:25:11 -04:00
Tibsfox
4334e49419 perf(init): hoist readdirSync and regex out of phase loop in manager (#1900)
cmdInitManager called fs.readdirSync(phasesDir) and compiled a new
RegExp inside the per-phase while loop. At 50 phases this produced
50 redundant directory scans and 50 regex compilations with full
ROADMAP content scans.

Move the directory listing before the loop and pre-extract all
checkbox states via a single matchAll pass. This reduces both
patterns from O(N^2) to O(N).

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 17:25:09 -04:00
Tibsfox
28517f7b6d perf(roadmap): hoist readdirSync out of phase loop in analyze command (#1899)
cmdRoadmapAnalyze called fs.readdirSync(phasesDir) inside the
per-phase while loop, causing O(N^2) directory reads for N phases.
At 50 phases this produced 100 redundant syscalls; at 100 phases, 200.

Move the directory listing before the loop and build a lookup array
that is reused for each phase match. This reduces the pattern from
O(N^2) to O(N) directory reads.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 17:24:58 -04:00
Tibsfox
9679e18ef4 perf(config): cache isGitIgnored result per process lifetime (#1898)
loadConfig() calls isGitIgnored() which spawns a git check-ignore
subprocess. The result is stable for the process lifetime but was
being recomputed on every call. With 28+ loadConfig call sites, this
could spawn multiple redundant git subprocesses per CLI invocation.

A module-level Map cache keyed on (cwd, targetPath) ensures the
subprocess fires at most once per unique pair per process.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 17:24:54 -04:00
Tom Boucher
3895178c6a fix(uninstall): remove gsd-file-manifest.json on uninstall (#1939)
The installer writes gsd-file-manifest.json to the runtime config root
at install time but uninstall() never removed it, leaving stale metadata
after every uninstall. Add fs.rmSync for MANIFEST_NAME at the end of the
uninstall cleanup sequence.

Regression test: tests/bug-1908-uninstall-manifest.test.cjs covers both
global and local uninstall paths.

Closes #1908

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-07 17:19:10 -04:00
RodZ
dced50d887 docs: remove duplicate keys in CONFIGURATION.md (#1895)
The Full Schema JSON block had `context_profile` listed twice, and the
"Hook Settings" section was duplicated later in the document.
2026-04-07 08:18:20 -04:00
Tibsfox
820543ee9f feat(references): add common bug patterns checklist for debugger agent (#1780)
* feat(references): add common bug patterns checklist for debugger

Create a technology-agnostic reference of ~80%-coverage bug patterns
ordered by frequency — off-by-one, null access, async timing, state
management, imports, environment, data shape, strings, filesystem,
and error handling. The debugger agent now reads this checklist before
forming hypotheses, reducing the chance of overlooking common causes.

Closes #1746

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(references): use bold bullet format in bug patterns per GSD convention (#1746)

- Convert checklist items from '- [ ]' checkbox format to '- **label** —'
  bold bullet format matching other GSD reference files
- Scope test to <patterns> block only so <usage> section doesn't fail
  the bold-bullet assertion

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 08:13:58 -04:00
Tibsfox
5c1f902204 fix(hooks): handle missing reference files gracefully during fresh install (#1878)
Add fs.existsSync() guards to all .js hook registrations in install.js,
matching the pattern already used for .sh hooks (#1817). When hooks/dist/
is missing from the npm package, the copy step produces no files but the
registration step previously ran unconditionally for .js hooks, causing
"PreToolUse:Bash hook error" on every tool invocation.

Each .js hook (check-update, context-monitor, prompt-guard, read-guard,
workflow-guard) now verifies the target file exists before registering
in settings.json, and emits a skip warning when the file is absent.

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 08:13:52 -04:00
Tibsfox
40f8286ee3 fix(docs): correct mode and discuss_mode allowed values in planning-config.md (#1882)
- Fix mode: "code-first"/"plan-first"/"hybrid" → "interactive"/"yolo"
  (verified against templates/config.json and workflows/new-project.md)
- Fix discuss_mode: "auto"/"analyze" → "assumptions"
  (verified against workflows/settings.md line 188)
- Add regression tests asserting correct values and rejecting stale ones

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 08:13:49 -04:00
Tibsfox
a452c4a03b fix(phase): scan ROADMAP.md entries in next-decimal to prevent collisions (#1877)
next-decimal and insert-phase only scanned directory names in
.planning/phases/ when calculating the next available decimal number.
When agents added backlog items by writing ROADMAP.md entries and
creating directories without calling next-decimal, the function would
not see those entries and return a number that was already in use.

Both functions now union directory names AND ROADMAP.md phase headers
(e.g. ### Phase 999.3: ...) before computing max + 1. This follows the
same pattern already used by cmdPhaseComplete (lines 791-834) which
scans ROADMAP.md as a fallback for phases defined but not yet
scaffolded to disk.

Additional hardening:
- Use escapeRegex() on normalized phase names in regex construction
- Support optional project-code prefix in directory pattern matching
- Handle edge cases: missing ROADMAP.md, empty/missing phases dir,
  leading-zero padded phase numbers in ROADMAP.md

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 08:13:46 -04:00
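The union-before-max idea can be sketched as follows. Directory naming, header format, and the function shape are simplified assumptions; the real code also runs the base through escapeRegex() before building patterns:

```javascript
// Hypothetical sketch: union phase directory names and ROADMAP.md headers
// before computing max + 1, so entries that exist only in ROADMAP.md
// still reserve their decimal number.
function nextDecimal(base, dirNames, roadmapText) {
  const used = new Set();
  const dirRe = new RegExp(`^(?:[A-Z]+-)?${base}\\.(\\d+)`); // optional project-code prefix
  for (const d of dirNames) {
    const m = dirRe.exec(d);
    if (m) used.add(Number(m[1]));
  }
  const headerRe = new RegExp(`^###\\s+Phase\\s+${base}\\.(\\d+)`, 'gm');
  for (const m of roadmapText.matchAll(headerRe)) used.add(Number(m[1]));
  const max = used.size ? Math.max(...used) : 0;
  return `${base}.${max + 1}`;
}
```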
Lex Christopherson
caf337508f 1.34.2 2026-04-06 14:54:12 -06:00
Lex Christopherson
c7de05e48f fix(engines): lower Node.js minimum to 22
Node 22 is still in Active LTS until October 2026 and Maintenance LTS
until April 2027. Raising the engines floor to >=24.0.0 unnecessarily
locked out a fully-supported LTS version and produced EBADENGINE
warnings on install. Restore Node 22 support, add Node 22 to the CI
matrix, and update CONTRIBUTING.md to match.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 14:54:12 -06:00
Tom Boucher
641ea8ad42 docs: update documentation for v1.34.0 release (#1868) 2026-04-06 16:25:41 -04:00
Lex Christopherson
07b7d40f70 1.34.1 2026-04-06 14:16:52 -06:00
Lex Christopherson
4463ee4f5b docs: update changelog for v1.34.1
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 14:16:45 -06:00
Lex Christopherson
cf385579cf docs: remove npm v1.32.0 stuck notice from README
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-06 14:05:19 -06:00
Tom Boucher
64589be2fc docs: add npm v1.32.0 stuck notice with GitHub install workaround
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 15:44:49 -04:00
Tom Boucher
d14e336793 chore: bump to 1.34.0
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 15:34:34 -04:00
Tibsfox
dd5d54f182 enhance(reapply-patches): post-merge verification to catch dropped hunks (#1775)
* feat(reapply-patches): post-merge verification to catch dropped hunks

Add a post-merge verification step to the reapply-patches workflow that
detects when user-modified content hunks are silently lost during
three-way merge. The verification performs line-count sanity checks and
hunk-presence verification against signature lines from each user
addition.

Warnings are advisory — the merge result is kept and the backup remains
available for manual recovery. This strengthens the never-skip invariant
from PR #1474 by ensuring not just that files are processed, but that
their content survives the merge intact.

Closes #1758

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* enhance(reapply-patches): add structural ordering test and refactor test setup (#1758)

- Add ordering test: verification section appears between merge-write
  and status-report steps (positional constraint, not just substring)
- Move file reads into before() hook per project test conventions
- Update commit prefix from feat: to enhance: per contribution taxonomy
  (addition to existing workflow, not new concept)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-06 15:20:06 -04:00
Tibsfox
2a3fe4fdb5 feat(references): add gates taxonomy with 4 canonical gate types (#1781)
* feat(references): add gates taxonomy with 4 canonical gate types

Define pre-flight, revision, escalation, and abort gates as the
canonical validation checkpoint types used across GSD workflows.
Includes a gate matrix mapping each workflow phase to its gate type,
checked artifacts, and failure behavior. Cross-referenced from
plan-phase and execute-phase workflows.

Closes #1715

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(agents): add gates.md reference to plan-checker and verifier per approved scope (#1715)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(agents): move gates.md to required_reading blocks and add stall detection (#1715)

- Move gates.md @-reference from <role> prose into <required_reading> blocks
  in gsd-plan-checker.md and gsd-verifier.md so it loads as context
- Add stall-detection to Revision Gate recovery description
- Fix /gsd-next → next for consistent workflow naming in Gate Matrix
- Update tests to verify required_reading placement and stall detection

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-06 15:19:46 -04:00
Tom Boucher
e9ede9975c fix(gsd-check-update): prioritize .claude in detectConfigDir search order (#1863)
Move .claude to the front of the detectConfigDir search array so Claude Code
sessions always find their own GSD install first, preventing false "update
available" warnings when an older OpenCode install coexists on the same machine.

Closes #1860

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 15:14:02 -04:00
Tom Boucher
0e06a44deb fix(package): include hooks/*.sh files in npm package (#1852 #1862) (#1864)
The "files" field in package.json listed "hooks/dist" instead of "hooks",
which excluded gsd-session-state.sh, gsd-validate-commit.sh, and
gsd-phase-boundary.sh from the npm tarball. Any fresh install from the
registry produced broken shell hook registrations.

Fix: replace "hooks/dist" with "hooks" so the full hooks/ directory is
bundled, covering both the compiled .js files (in hooks/dist/) and the
.sh source hooks at the top of hooks/.

Adds regression test in tests/package-manifest.test.cjs.

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 15:13:23 -04:00
Tom Boucher
09e56893c8 fix(milestone): preserve 999.x backlog phases during phases clear (#1858)
* fix(milestone): preserve 999.x backlog phases during phases clear

Fixes #1853

* fix: remove accidentally bundled plan-stall-detection test

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-06 08:54:18 -04:00
Alan
2d80cc3afd fix: use ~/.codeium/windsurf as Windsurf global config dir (#1856) 2026-04-06 08:40:37 -04:00
Tom Boucher
f7d4d60522 fix(ci): drop Node 22 from matrix, require Node 24 minimum (#1848)
Node 20 reached EOL April 30 2026. Node 22 is no longer the LTS
baseline — Node 24 is the current Active LTS. Update CI matrix to
run only Node 24, raise engines floor to >=24.0.0, and update
CONTRIBUTING.md node compatibility table accordingly.

Fixes #1847

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-05 23:23:07 -04:00
134 changed files with 9507 additions and 725 deletions


@@ -37,7 +37,7 @@ jobs:
base_tag: ${{ steps.validate.outputs.base_tag }}
branch: ${{ steps.validate.outputs.branch }}
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
@@ -73,11 +73,11 @@ jobs:
permissions:
contents: write
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
+ - uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: ${{ env.NODE_VERSION }}
@@ -124,12 +124,12 @@ jobs:
id-token: write
environment: npm-publish
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ needs.validate-version.outputs.branch }}
fetch-depth: 0
- - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
+ - uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: ${{ env.NODE_VERSION }}
registry-url: 'https://registry.npmjs.org'


@@ -38,7 +38,7 @@ jobs:
branch: ${{ steps.validate.outputs.branch }}
is_major: ${{ steps.validate.outputs.is_major }}
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
@@ -69,11 +69,11 @@ jobs:
permissions:
contents: write
steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+ - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
+ - uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: ${{ env.NODE_VERSION }}
@@ -123,12 +123,12 @@ jobs:
id-token: write
environment: npm-publish
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ needs.validate-version.outputs.branch }}
fetch-depth: 0
- uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
- uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: ${{ env.NODE_VERSION }}
registry-url: 'https://registry.npmjs.org'
@@ -251,12 +251,12 @@ jobs:
id-token: write
environment: npm-publish
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{ needs.validate-version.outputs.branch }}
fetch-depth: 0
- uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
- uses: actions/setup-node@53b83947a5a98c8d113130e565377fae1a50d02f # v6.3.0
with:
node-version: ${{ env.NODE_VERSION }}
registry-url: 'https://registry.npmjs.org'

View File

@@ -4,6 +4,8 @@ on:
pull_request:
branches:
- main
- 'release/**'
- 'hotfix/**'
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}

View File

@@ -4,6 +4,8 @@ on:
push:
branches:
- main
- 'release/**'
- 'hotfix/**'
pull_request:
branches:
- main

View File

@@ -7,8 +7,31 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- **Review model configuration** — Per-CLI model selection for /gsd-review via `review.models.<cli>` config keys. Falls back to CLI defaults when not set. (#1849)
## [1.34.2] - 2026-04-06
### Changed
- **Node.js minimum lowered to 22** — `engines.node` was raised to `>=24.0.0` based on a CI matrix change, but Node 22 is still in Active LTS until October 2026. Restoring Node 22 support eliminates the `EBADENGINE` warning for users on the previous LTS line. CI matrix now tests against both Node 22 and Node 24.
## [1.34.1] - 2026-04-06
### Fixed
- **npm publish catchup** — v1.33.0 and v1.34.0 were tagged but never published to npm; this release makes all changes available via `npx get-shit-done-cc@latest`
- Removed npm v1.32.0 stuck notice from README
## [1.34.0] - 2026-04-06
### Added
- **Gates taxonomy reference** — 4 canonical gate types (pre-flight, revision, escalation, abort) with phase matrix wired into plan-checker and verifier agents (#1781)
- **Post-merge hunk verification** — `reapply-patches` now detects silently dropped hunks after three-way merge (#1775)
- **Execution context profiles** — Three context profiles (`dev`, `research`, `review`) for mode-specific agent output guidance (#1807)
### Fixed
- **Shell hooks missing from npm package** — `hooks/*.sh` files excluded from tarball due to `hooks/dist` allowlist; changed to `hooks` (#1852 #1862)
- **detectConfigDir priority** — `.claude` now searched first so Claude Code users don't see false update warnings when multiple runtimes are installed (#1860)
- **Milestone backlog preservation** — `phases clear` no longer wipes 999.x backlog phases (#1858)
## [1.33.0] - 2026-04-05
### Added
@@ -1844,7 +1867,11 @@ Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- YOLO mode for autonomous execution
- Interactive mode with checkpoints
[Unreleased]: https://github.com/gsd-build/get-shit-done/compare/v1.30.0...HEAD
[Unreleased]: https://github.com/gsd-build/get-shit-done/compare/v1.34.2...HEAD
[1.34.2]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.34.2
[1.34.1]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.34.1
[1.34.0]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.34.0
[1.33.0]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.33.0
[1.30.0]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.30.0
[1.29.0]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.29.0
[1.28.0]: https://github.com/gsd-build/get-shit-done/releases/tag/v1.28.0

View File

@@ -88,7 +88,7 @@ PRs that arrive without a properly-labeled linked issue are closed automatically
- **Link with a closing keyword** — use `Closes #123`, `Fixes #123`, or `Resolves #123` in the PR body. The CI check will fail and the PR will be auto-closed if no valid issue reference is found.
- **One concern per PR** — bug fixes, enhancements, and features must be separate PRs
- **No drive-by formatting** — don't reformat code unrelated to your change
- **CI must pass** — all matrix jobs (Ubuntu, macOS, Windows × Node 22, 24) must be green
- **CI must pass** — all matrix jobs (Ubuntu × Node 22, 24; macOS × Node 24) must be green
- **Scope matches the approved issue** — if your PR does more than what the issue describes, you will be asked to remove the extra changes or move them to a new issue
## Testing Standards
@@ -231,25 +231,25 @@ const content = `
### Node.js Version Compatibility
**Node 24 is the primary CI target.** All tests must pass on Node 24. Node 22 (LTS) must remain backward-compatible — do not use APIs that are not available in Node 22.
**Node 22 is the minimum supported version.** Node 24 is the primary CI target. All tests must pass on both.
| Version | Status |
|---------|--------|
| **Node 24** | Primary CI target — all tests must pass |
| **Node 22** | Backward compatibility required |
| **Node 22** | Minimum required — Active LTS until October 2026, Maintenance LTS until April 2027 |
| **Node 24** | Primary CI target — current Active LTS, all tests must pass |
| Node 26 | Forward-compatible target — avoid deprecated APIs |
Do not use:
- Deprecated APIs
- Version-specific features not available in Node 22
- APIs not available in Node 22
Safe to use:
- `node:test` — stable since Node 18, fully featured in 22+
- `node:test` — stable since Node 18, fully featured in 24
- `describe`/`it`/`test` — all supported
- `beforeEach`/`afterEach`/`before`/`after` — all supported
- `t.after()` — per-test cleanup, available in Node 22+
- `t.plan()` — available since Node 22.2
- Snapshot testing — available since Node 22.3
- `t.after()` — per-test cleanup
- `t.plan()` — fully supported
- Snapshot testing — fully supported
### Assertions

View File

@@ -4,7 +4,7 @@
**English** · [Português](README.pt-BR.md) · [简体中文](README.zh-CN.md) · [日本語](README.ja-JP.md) · [한국어](README.ko-KR.md)
**A light-weight and powerful meta-prompting, context engineering and spec-driven development system for Claude Code, OpenCode, Gemini CLI, Kilo, Codex, Copilot, Cursor, Windsurf, Antigravity, Augment, Trae, and Cline.**
**A light-weight and powerful meta-prompting, context engineering and spec-driven development system for Claude Code, OpenCode, Gemini CLI, Kilo, Codex, Copilot, Cursor, Windsurf, Antigravity, Augment, Trae, CodeBuddy, and Cline.**
**Solves context rot — the quality degradation that happens as Claude fills its context window.**
@@ -89,13 +89,13 @@ People who want to describe what they want and have it built correctly — witho
Built-in quality gates catch real problems: schema drift detection flags ORM changes missing migrations, security enforcement anchors verification to threat models, and scope reduction detection prevents the planner from silently dropping your requirements.
### v1.33.0 Highlights
### v1.34.0 Highlights
- **9 bug fixes** — Frontmatter parser, cross-platform planning lock, model alias updates, prompt guard, Kilo/skill path replacement, and more
- **Shared behavioral references** — Consistent agent behavior via questioning, domain-probes, and UI-brand reference docs
- **CONFIG_DEFAULTS refactor** — Single source of truth for all config defaults, eliminating scattered hardcoded values
- **Test standardization** — Full migration to `node:assert/strict` and `t.after()` cleanup patterns
- **Typed contribution templates** — Separate Bug, Enhancement, and Feature workflows with approval gates
- **Gates taxonomy** — 4 canonical gate types (pre-flight, revision, escalation, abort) wired into plan-checker and verifier agents
- **Shell hooks fix** — `hooks/*.sh` files are now correctly included in the npm package, eliminating startup hook errors on fresh installs
- **Post-merge hunk verification** — `reapply-patches` detects silently dropped hunks after three-way merge
- **detectConfigDir fix** — Claude Code users no longer see false "update available" warnings when multiple runtimes are installed
- **3 bug fixes** — Milestone backlog preservation, detectConfigDir priority, and npm package manifest
---
@@ -106,12 +106,12 @@ npx get-shit-done-cc@latest
```
The installer prompts you to choose:
1. **Runtime** — Claude Code, OpenCode, Gemini, Kilo, Codex, Copilot, Cursor, Windsurf, Antigravity, Augment, Trae, Cline, or all (interactive multi-select — pick multiple runtimes in a single install session)
1. **Runtime** — Claude Code, OpenCode, Gemini, Kilo, Codex, Copilot, Cursor, Windsurf, Antigravity, Augment, Trae, CodeBuddy, Cline, or all (interactive multi-select — pick multiple runtimes in a single install session)
2. **Location** — Global (all projects) or local (current project only)
Verify with:
- Claude Code / Gemini / Copilot / Antigravity: `/gsd-help`
- OpenCode / Kilo / Augment / Trae: `/gsd-help`
- OpenCode / Kilo / Augment / Trae / CodeBuddy: `/gsd-help`
- Codex: `$gsd-help`
- Cline: GSD installs via `.clinerules` — verify by checking `.clinerules` exists
@@ -160,7 +160,7 @@ npx get-shit-done-cc --cursor --global # Install to ~/.cursor/
npx get-shit-done-cc --cursor --local # Install to ./.cursor/
# Windsurf
npx get-shit-done-cc --windsurf --global # Install to ~/.windsurf/
npx get-shit-done-cc --windsurf --global # Install to ~/.codeium/windsurf/
npx get-shit-done-cc --windsurf --local # Install to ./.windsurf/
# Antigravity
@@ -175,6 +175,10 @@ npx get-shit-done-cc --augment --local # Install to ./.augment/
npx get-shit-done-cc --trae --global # Install to ~/.trae/
npx get-shit-done-cc --trae --local # Install to ./.trae/
# CodeBuddy
npx get-shit-done-cc --codebuddy --global # Install to ~/.codebuddy/
npx get-shit-done-cc --codebuddy --local # Install to ./.codebuddy/
# Cline
npx get-shit-done-cc --cline --global # Install to ~/.cline/
npx get-shit-done-cc --cline --local # Install to ./.clinerules
@@ -184,7 +188,7 @@ npx get-shit-done-cc --all --global # Install to all directories
```
Use `--global` (`-g`) or `--local` (`-l`) to skip the location prompt.
Use `--claude`, `--opencode`, `--gemini`, `--kilo`, `--codex`, `--copilot`, `--cursor`, `--windsurf`, `--antigravity`, `--augment`, `--trae`, `--cline`, or `--all` to skip the runtime prompt.
Use `--claude`, `--opencode`, `--gemini`, `--kilo`, `--codex`, `--copilot`, `--cursor`, `--windsurf`, `--antigravity`, `--augment`, `--trae`, `--codebuddy`, `--cline`, or `--all` to skip the runtime prompt.
Use `--sdk` to also install the GSD SDK CLI (`gsd-sdk`) for headless autonomous execution.
</details>
@@ -846,6 +850,7 @@ npx get-shit-done-cc --windsurf --global --uninstall
npx get-shit-done-cc --antigravity --global --uninstall
npx get-shit-done-cc --augment --global --uninstall
npx get-shit-done-cc --trae --global --uninstall
npx get-shit-done-cc --codebuddy --global --uninstall
npx get-shit-done-cc --cline --global --uninstall
# Local installs (current project)
@@ -860,6 +865,7 @@ npx get-shit-done-cc --windsurf --local --uninstall
npx get-shit-done-cc --antigravity --local --uninstall
npx get-shit-done-cc --augment --local --uninstall
npx get-shit-done-cc --trae --local --uninstall
npx get-shit-done-cc --codebuddy --local --uninstall
npx get-shit-done-cc --cline --local --uninstall
```

View File

@@ -4,7 +4,7 @@
[English](README.md) · [Português](README.pt-BR.md) · **简体中文** · [日本語](README.ja-JP.md) · [한국어](README.ko-KR.md)
**一个轻量但强大的元提示、上下文工程与规格驱动开发系统,适用于 Claude Code、OpenCode、Gemini CLI、Kilo、Codex、Copilot、Cursor、Windsurf、Antigravity、Augment、Trae 和 Cline。**
**一个轻量但强大的元提示、上下文工程与规格驱动开发系统,适用于 Claude Code、OpenCode、Gemini CLI、Kilo、Codex、Copilot、Cursor、Windsurf、Antigravity、Augment、Trae、CodeBuddy 和 Cline。**
**它解决的是 context rot随着 Claude 的上下文窗口被填满,输出质量逐步劣化的问题。**
@@ -92,12 +92,12 @@ npx get-shit-done-cc@latest
```
安装器会提示你选择:
1. **运行时**Claude Code、OpenCode、Gemini、Kilo、Codex、Copilot、Cursor、Windsurf、Antigravity、Augment、Trae、Cline或全部
1. **运行时**Claude Code、OpenCode、Gemini、Kilo、Codex、Copilot、Cursor、Windsurf、Antigravity、Augment、Trae、CodeBuddy、Cline或全部
2. **安装位置**:全局(所有项目)或本地(仅当前项目)
安装后可这样验证:
- Claude Code / Gemini / Copilot / Antigravity`/gsd-help`
- OpenCode / Kilo / Augment / Trae`/gsd-help`
- OpenCode / Kilo / Augment / Trae / CodeBuddy`/gsd-help`
- Codex`$gsd-help`
- ClineGSD 通过 `.clinerules` 安装 — 检查 `.clinerules` 是否存在
@@ -157,6 +157,10 @@ npx get-shit-done-cc --augment --local # 安装到 ./.augment/
npx get-shit-done-cc --trae --global # 安装到 ~/.trae/
npx get-shit-done-cc --trae --local # 安装到 ./.trae/
# CodeBuddy
npx get-shit-done-cc --codebuddy --global # 安装到 ~/.codebuddy/
npx get-shit-done-cc --codebuddy --local # 安装到 ./.codebuddy/
# Cline
npx get-shit-done-cc --cline --global # 安装到 ~/.cline/
npx get-shit-done-cc --cline --local # 安装到 ./.clinerules
@@ -166,7 +170,7 @@ npx get-shit-done-cc --all --global # 安装到所有目录
```
使用 `--global``-g`)或 `--local``-l`)可以跳过安装位置提示。
使用 `--claude``--opencode``--gemini``--kilo``--codex``--copilot``--cursor``--windsurf``--antigravity``--augment``--trae``--cline``--all` 可以跳过运行时提示。
使用 `--claude``--opencode``--gemini``--kilo``--codex``--copilot``--cursor``--windsurf``--antigravity``--augment``--trae``--codebuddy``--cline``--all` 可以跳过运行时提示。
</details>

agents/gsd-ai-researcher.md Normal file
View File

@@ -0,0 +1,110 @@
---
name: gsd-ai-researcher
description: Researches a chosen AI framework's official docs to produce implementation-ready guidance — best practices, syntax, core patterns, and pitfalls distilled for the specific use case. Writes the Framework Quick Reference and Implementation Guidance sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator.
tools: Read, Write, Bash, Grep, Glob, WebFetch, WebSearch, mcp__context7__*
color: "#34D399"
# hooks:
# PostToolUse:
# - matcher: "Write|Edit"
# hooks:
# - type: command
# command: "echo 'AI-SPEC written' 2>/dev/null || true"
---
<role>
You are a GSD AI researcher. Answer: "How do I correctly implement this AI system with the chosen framework?"
Write Sections 3-4b of AI-SPEC.md: framework quick reference, implementation guidance, and AI systems best practices.
</role>
<required_reading>
Read `~/.claude/get-shit-done/references/ai-frameworks.md` for framework profiles and known pitfalls before fetching docs.
</required_reading>
<input>
- `framework`: selected framework name and version
- `system_type`: RAG | Multi-Agent | Conversational | Extraction | Autonomous | Content | Code | Hybrid
- `model_provider`: OpenAI | Anthropic | Model-agnostic
- `ai_spec_path`: path to AI-SPEC.md
- `phase_context`: phase name and goal
- `context_path`: path to CONTEXT.md if it exists
**If prompt contains `<files_to_read>`, read every listed file before doing anything else.**
</input>
<documentation_sources>
Use context7 MCP first (fastest). Fall back to WebFetch.
| Framework | Official Docs URL |
|-----------|------------------|
| CrewAI | https://docs.crewai.com |
| LlamaIndex | https://docs.llamaindex.ai |
| LangChain | https://python.langchain.com/docs |
| LangGraph | https://langchain-ai.github.io/langgraph |
| OpenAI Agents SDK | https://openai.github.io/openai-agents-python |
| Claude Agent SDK | https://docs.anthropic.com/en/docs/claude-code/sdk |
| AutoGen / AG2 | https://ag2ai.github.io/ag2 |
| Google ADK | https://google.github.io/adk-docs |
| Haystack | https://docs.haystack.deepset.ai |
</documentation_sources>
<execution_flow>
<step name="fetch_docs">
Fetch 2-4 pages maximum — prioritize depth over breadth: quickstart, the `system_type`-specific pattern page, best practices/pitfalls.
Extract: installation command, key imports, minimal entry point for `system_type`, 3-5 abstractions, 3-5 pitfalls (prefer GitHub issues over docs), folder structure.
</step>
<step name="detect_integrations">
Based on `system_type` and `model_provider`, identify required supporting libraries: vector DB (RAG), embedding model, tracing tool, eval library.
Fetch brief setup docs for each.
</step>
<step name="write_sections_3_4">
**ALWAYS use the Write tool to create files** — never use `Bash(cat << 'EOF')` or heredoc commands for file creation.
Update AI-SPEC.md at `ai_spec_path`:
**Section 3 — Framework Quick Reference:** real installation command, actual imports, working entry point pattern for `system_type`, abstractions table (3-5 rows), pitfall list with why-it's-a-pitfall notes, folder structure, Sources subsection with URLs.
**Section 4 — Implementation Guidance:** specific model (e.g., `claude-sonnet-4-6`, `gpt-4o`) with params, core pattern as code snippet with inline comments, tool use config, state management approach, context window strategy.
</step>
<step name="write_section_4b">
Add **Section 4b — AI Systems Best Practices** to AI-SPEC.md. Always included, independent of framework choice.
**4b.1 Structured Outputs with Pydantic** — Define the output schema as a Pydantic model; validate every LLM response against it and retry on failure. Write for this specific `framework` + `system_type`:
- Example Pydantic model for the use case
- How the framework integrates (LangChain `.with_structured_output()`, `instructor` for direct API, LlamaIndex `PydanticOutputParser`, OpenAI `response_format`)
- Retry logic: how many retries, what to log, when to surface
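The validate-or-retry loop described above can be sketched as follows; `call_llm` and the `TicketTriage` fields are hypothetical stand-ins for the framework's completion call and the real use-case schema:

```python
from pydantic import BaseModel, ValidationError


class TicketTriage(BaseModel):
    # Illustrative schema for a support-ticket extraction use case
    category: str
    priority: int
    summary: str


def call_llm(prompt: str) -> str:
    """Hypothetical LLM call; replace with the framework's client."""
    return '{"category": "billing", "priority": 2, "summary": "Refund request"}'


def extract_with_retry(prompt: str, max_retries: int = 3) -> TicketTriage:
    last_error = None
    for _attempt in range(max_retries):
        raw = call_llm(prompt)
        try:
            return TicketTriage.model_validate_json(raw)
        except ValidationError as err:
            last_error = err
            # Log the failure and feed the validation error back into the
            # prompt so the model can self-correct on the next attempt
            prompt = f"{prompt}\n\nPrevious output failed validation: {err}"
    raise RuntimeError(f"Validation failed after {max_retries} retries") from last_error
```

The framework integrations listed above wrap roughly this loop for you; the sketch shows the shape of what happens underneath.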
**4b.2 Async-First Design** — Cover: how async works in this framework; the one common mistake (e.g., `asyncio.run()` in an event loop); stream vs. await (stream for UX, await for structured output validation).
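A minimal sketch of the async fan-out this section recommends; the sleep simulates a network-bound LLM call:

```python
import asyncio


async def fake_llm_call(name: str) -> str:
    # Simulated network-bound LLM call
    await asyncio.sleep(0.1)
    return f"{name}: done"


async def main() -> list[str]:
    # Concurrent fan-out: total wall time is roughly one call, not three
    return await asyncio.gather(
        fake_llm_call("classify"),
        fake_llm_call("summarize"),
        fake_llm_call("route"),
    )


# The common mistake: calling asyncio.run() from code that is already
# inside a running event loop raises RuntimeError. Call it once, at the
# top level only.
results = asyncio.run(main())
```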
**4b.3 Prompt Engineering Discipline** — System vs. user prompt separation; few-shot: inline vs. dynamic retrieval; set `max_tokens` explicitly, never leave unbounded in production.
**4b.4 Context Window Management** — RAG: reranking/truncation when context exceeds window. Multi-agent/Conversational: summarisation patterns. Autonomous: framework compaction handling.
**4b.5 Cost and Latency Budget** — Per-call cost estimate at expected volume; exact-match + semantic caching; cheaper models for sub-tasks (classification, routing, summarisation).
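The per-call arithmetic behind such a budget, with placeholder prices rather than any provider's actual rates:

```python
# Hypothetical per-million-token prices; substitute the provider's real rates.
PRICE_IN_PER_M = 3.00    # USD per 1M input tokens
PRICE_OUT_PER_M = 15.00  # USD per 1M output tokens


def monthly_cost(calls: int, in_tokens: int, out_tokens: int) -> float:
    """Estimate monthly spend from call volume and average token counts."""
    per_call = (in_tokens * PRICE_IN_PER_M + out_tokens * PRICE_OUT_PER_M) / 1_000_000
    return calls * per_call


# 100k calls/month, 2k prompt tokens and 500 completion tokens each
estimate = monthly_cost(100_000, 2_000, 500)
```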
</step>
</execution_flow>
<quality_standards>
- All code snippets syntactically correct for the fetched version
- Imports match actual package structure (not approximate)
- Pitfalls specific — "use async where supported" is useless
- Entry point pattern is copy-paste runnable
- No hallucinated API methods — note "verify in docs" if unsure
- Section 4b examples specific to `framework` + `system_type`, not generic
</quality_standards>
<success_criteria>
- [ ] Official docs fetched (2-4 pages, not just homepage)
- [ ] Installation command correct for latest stable version
- [ ] Entry point pattern runs for `system_type`
- [ ] 3-5 abstractions in context of use case
- [ ] 3-5 specific pitfalls with explanations
- [ ] Sections 3 and 4 written and non-empty
- [ ] Section 4b: Pydantic example for this framework + system_type
- [ ] Section 4b: async pattern, prompt discipline, context management, cost budget
- [ ] Sources listed in Section 3
</success_criteria>

View File

@@ -31,6 +31,10 @@ If the prompt contains a `<files_to_read>` block, you MUST use the `Read` tool t
- Handle checkpoints when user input is unavoidable
</role>
<required_reading>
@~/.claude/get-shit-done/references/common-bug-patterns.md
</required_reading>
<philosophy>
## User = Reporter, Claude = Investigator

View File

@@ -0,0 +1,130 @@
---
name: gsd-domain-researcher
description: Researches the business domain and real-world application context of the AI system being built. Surfaces domain expert evaluation criteria, industry-specific failure modes, regulatory context, and what "good" looks like for practitioners in this field — before the eval-planner turns it into measurable rubrics. Spawned by /gsd-ai-integration-phase orchestrator.
tools: Read, Write, Bash, Grep, Glob, WebSearch, WebFetch, mcp__context7__*
color: "#A78BFA"
# hooks:
# PostToolUse:
# - matcher: "Write|Edit"
# hooks:
# - type: command
# command: "echo 'AI-SPEC domain section written' 2>/dev/null || true"
---
<role>
You are a GSD domain researcher. Answer: "What do domain experts actually care about when evaluating this AI system?"
Research the business domain — not the technical framework. Write Section 1b of AI-SPEC.md.
</role>
<required_reading>
Read `~/.claude/get-shit-done/references/ai-evals.md` — specifically the rubric design and domain expert sections.
</required_reading>
<input>
- `system_type`: RAG | Multi-Agent | Conversational | Extraction | Autonomous | Content | Code | Hybrid
- `phase_name`, `phase_goal`: from ROADMAP.md
- `ai_spec_path`: path to AI-SPEC.md (partially written)
- `context_path`: path to CONTEXT.md if it exists
- `requirements_path`: path to REQUIREMENTS.md if it exists
**If prompt contains `<files_to_read>`, read every listed file before doing anything else.**
</input>
<execution_flow>
<step name="extract_domain_signal">
Read AI-SPEC.md, CONTEXT.md, REQUIREMENTS.md. Extract: industry vertical, user population, stakes level, output type.
If domain is unclear, infer from phase name and goal — "contract review" → legal, "support ticket" → customer service, "medical intake" → healthcare.
</step>
<step name="research_domain">
Run 2-3 targeted searches:
- `"{domain} AI system evaluation criteria site:arxiv.org OR site:research.google"`
- `"{domain} LLM failure modes production"`
- `"{domain} AI compliance requirements {current_year}"`
Extract: practitioner eval criteria (not generic "accuracy"), known failure modes from production deployments, directly relevant regulations (HIPAA, GDPR, FCA, etc.), domain expert roles.
</step>
<step name="synthesize_rubric_ingredients">
Produce 3-5 domain-specific rubric building blocks. Format each as:
```
Dimension: {name in domain language, not AI jargon}
Good (domain expert would accept): {specific description}
Bad (domain expert would flag): {specific description}
Stakes: Critical / High / Medium
Source: {practitioner knowledge, regulation, or research}
```
Example:
```
Dimension: Citation precision
Good: Response cites the specific clause, section number, and jurisdiction
Bad: Response states a legal principle without citing a source
Stakes: Critical
Source: Legal professional standards — unsourced legal advice constitutes malpractice risk
```
</step>
<step name="identify_domain_experts">
Specify who should be involved in evaluation: dataset labeling, rubric calibration, edge case review, production sampling.
If the system is internal tooling with no regulated domain, the "domain expert" is the product owner or a senior team practitioner.
</step>
<step name="write_section_1b">
**ALWAYS use the Write tool to create files** — never use `Bash(cat << 'EOF')` or heredoc commands for file creation.
Update AI-SPEC.md at `ai_spec_path`. Add/update Section 1b:
```markdown
## 1b. Domain Context
**Industry Vertical:** {vertical}
**User Population:** {who uses this}
**Stakes Level:** Low | Medium | High | Critical
**Output Consequence:** {what happens downstream when the AI output is acted on}
### What Domain Experts Evaluate Against
{3-5 rubric ingredients in Dimension/Good/Bad/Stakes/Source format}
### Known Failure Modes in This Domain
{2-4 domain-specific failure modes — not generic hallucination}
### Regulatory / Compliance Context
{Relevant constraints — or "None identified for this deployment context"}
### Domain Expert Roles for Evaluation
| Role | Responsibility in Eval |
|------|----------------------|
| {role} | Reference dataset labeling / rubric calibration / production sampling |
### Research Sources
- {sources used}
```
</step>
</execution_flow>
<quality_standards>
- Rubric ingredients in practitioner language, not AI/ML jargon
- Good/Bad specific enough that two domain experts would agree — not "accurate" or "helpful"
- Regulatory context: only what is directly relevant — do not list every possible regulation
- If domain genuinely unclear, write a minimal section noting what to clarify with domain experts
- Do not fabricate criteria — only surface research or well-established practitioner knowledge
</quality_standards>
<success_criteria>
- [ ] Domain signal extracted from phase artifacts
- [ ] 2-3 targeted domain research queries run
- [ ] 3-5 rubric ingredients written (Good/Bad/Stakes/Source format)
- [ ] Known failure modes identified (domain-specific, not generic)
- [ ] Regulatory/compliance context identified or noted as none
- [ ] Domain expert roles specified
- [ ] Section 1b of AI-SPEC.md written and non-empty
- [ ] Research sources listed
</success_criteria>

agents/gsd-eval-auditor.md Normal file
View File

@@ -0,0 +1,164 @@
---
name: gsd-eval-auditor
description: Retroactive audit of an implemented AI phase's evaluation coverage. Checks implementation against the AI-SPEC.md evaluation plan. Scores each eval dimension as COVERED/PARTIAL/MISSING. Produces a scored EVAL-REVIEW.md with findings, gaps, and remediation guidance. Spawned by /gsd-eval-review orchestrator.
tools: Read, Write, Bash, Grep, Glob
color: "#EF4444"
# hooks:
# PostToolUse:
# - matcher: "Write|Edit"
# hooks:
# - type: command
# command: "echo 'EVAL-REVIEW written' 2>/dev/null || true"
---
<role>
You are a GSD eval auditor. Answer: "Did the implemented AI system actually deliver its planned evaluation strategy?"
Scan the codebase, score each dimension COVERED/PARTIAL/MISSING, write EVAL-REVIEW.md.
</role>
<required_reading>
Read `~/.claude/get-shit-done/references/ai-evals.md` before auditing. This is your scoring framework.
</required_reading>
<input>
- `ai_spec_path`: path to AI-SPEC.md (planned eval strategy)
- `summary_paths`: all SUMMARY.md files in the phase directory
- `phase_dir`: phase directory path
- `phase_number`, `phase_name`
**If prompt contains `<files_to_read>`, read every listed file before doing anything else.**
</input>
<execution_flow>
<step name="read_phase_artifacts">
Read AI-SPEC.md (Sections 5, 6, 7), all SUMMARY.md files, and PLAN.md files.
Extract from AI-SPEC.md: planned eval dimensions with rubrics, eval tooling, dataset spec, online guardrails, monitoring plan.
</step>
<step name="scan_codebase">
```bash
# Eval/test files
find . \( -name "*.test.*" -o -name "*.spec.*" -o -name "test_*" -o -name "eval_*" \) \
-not -path "*/node_modules/*" -not -path "*/.git/*" 2>/dev/null | head -40
# Tracing/observability setup
grep -r "langfuse\|langsmith\|arize\|phoenix\|braintrust\|promptfoo" \
--include="*.py" --include="*.ts" --include="*.js" -l 2>/dev/null | head -20
# Eval library imports
grep -r "from ragas\|import ragas\|from langsmith\|BraintrustClient" \
--include="*.py" --include="*.ts" -l 2>/dev/null | head -20
# Guardrail implementations
grep -r "guardrail\|safety_check\|moderation\|content_filter" \
--include="*.py" --include="*.ts" --include="*.js" -l 2>/dev/null | head -20
# Eval config files and reference dataset
find . \( -name "promptfoo.yaml" -o -name "eval.config.*" -o -name "*.jsonl" -o -name "evals*.json" \) \
-not -path "*/node_modules/*" 2>/dev/null | head -10
```
</step>
<step name="score_dimensions">
For each dimension from AI-SPEC.md Section 5:
| Status | Criteria |
|--------|----------|
| **COVERED** | Implementation exists, targets the rubric behavior, runs (automated or documented manual) |
| **PARTIAL** | Exists but incomplete — missing rubric specificity, not automated, or has known gaps |
| **MISSING** | No implementation found for this dimension |
For PARTIAL and MISSING: record what was planned, what was found, and specific remediation to reach COVERED.
</step>
<step name="audit_infrastructure">
Score 5 components (ok / partial / missing):
- **Eval tooling**: installed and actually called (not just listed as a dependency)
- **Reference dataset**: file exists and meets size/composition spec
- **CI/CD integration**: eval command present in Makefile, GitHub Actions, etc.
- **Online guardrails**: each planned guardrail implemented in the request path (not stubbed)
- **Tracing**: tool configured and wrapping actual AI calls
</step>
<step name="calculate_scores">
```
coverage_score = covered_count / total_dimensions × 100
infra_score = (tooling + dataset + cicd + guardrails + tracing) / 5 × 100
overall_score = (coverage_score × 0.6) + (infra_score × 0.4)
```
Verdict:
- 80-100: **PRODUCTION READY** — deploy with monitoring
- 60-79: **NEEDS WORK** — address CRITICAL gaps before production
- 40-59: **SIGNIFICANT GAPS** — do not deploy
- 0-39: **NOT IMPLEMENTED** — review AI-SPEC.md and implement
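The formulas and thresholds above, restated as a runnable sketch (the sample counts are illustrative, and infrastructure components are simplified to a binary ok count):

```python
def eval_scores(covered: int, total_dimensions: int, infra_ok: int) -> tuple[float, float, float, str]:
    """Score eval coverage. infra_ok counts the 5 infrastructure components scored ok."""
    coverage = covered / total_dimensions * 100
    infra = infra_ok / 5 * 100
    overall = coverage * 0.6 + infra * 0.4
    if overall >= 80:
        verdict = "PRODUCTION READY"
    elif overall >= 60:
        verdict = "NEEDS WORK"
    elif overall >= 40:
        verdict = "SIGNIFICANT GAPS"
    else:
        verdict = "NOT IMPLEMENTED"
    return coverage, infra, overall, verdict


# 4 of 6 dimensions covered, 4 of 5 infrastructure components ok
coverage, infra, overall, verdict = eval_scores(4, 6, 4)
```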
</step>
<step name="write_eval_review">
**ALWAYS use the Write tool to create files** — never use `Bash(cat << 'EOF')` or heredoc commands for file creation.
Write to `{phase_dir}/{padded_phase}-EVAL-REVIEW.md`:
```markdown
# EVAL-REVIEW — Phase {N}: {name}
**Audit Date:** {date}
**AI-SPEC Present:** Yes / No
**Overall Score:** {score}/100
**Verdict:** {PRODUCTION READY | NEEDS WORK | SIGNIFICANT GAPS | NOT IMPLEMENTED}
## Dimension Coverage
| Dimension | Status | Measurement | Finding |
|-----------|--------|-------------|---------|
| {dim} | COVERED/PARTIAL/MISSING | Code/LLM Judge/Human | {finding} |
**Coverage Score:** {n}/{total} ({pct}%)
## Infrastructure Audit
| Component | Status | Finding |
|-----------|--------|---------|
| Eval tooling ({tool}) | Installed / Configured / Not found | |
| Reference dataset | Present / Partial / Missing | |
| CI/CD integration | Present / Missing | |
| Online guardrails | Implemented / Partial / Missing | |
| Tracing ({tool}) | Configured / Not configured | |
**Infrastructure Score:** {score}/100
## Critical Gaps
{MISSING items with Critical severity only}
## Remediation Plan
### Must fix before production:
{Ordered CRITICAL gaps with specific steps}
### Should fix soon:
{PARTIAL items with steps}
### Nice to have:
{Lower-priority MISSING items}
## Files Found
{Eval-related files discovered during scan}
```
</step>
</execution_flow>
<success_criteria>
- [ ] AI-SPEC.md read (or noted as absent)
- [ ] All SUMMARY.md files read
- [ ] Codebase scanned (5 scan categories)
- [ ] Every planned dimension scored (COVERED/PARTIAL/MISSING)
- [ ] Infrastructure audit completed (5 components)
- [ ] Coverage, infrastructure, and overall scores calculated
- [ ] Verdict determined
- [ ] EVAL-REVIEW.md written with all sections populated
- [ ] Critical gaps identified and remediation is specific and actionable
</success_criteria>

agents/gsd-eval-planner.md Normal file
View File

@@ -0,0 +1,154 @@
---
name: gsd-eval-planner
description: Designs a structured evaluation strategy for an AI phase. Identifies critical failure modes, selects eval dimensions with rubrics, recommends tooling, and specifies the reference dataset. Writes the Evaluation Strategy, Guardrails, and Production Monitoring sections of AI-SPEC.md. Spawned by /gsd-ai-integration-phase orchestrator.
tools: Read, Write, Bash, Grep, Glob, AskUserQuestion
color: "#F59E0B"
# hooks:
# PostToolUse:
# - matcher: "Write|Edit"
# hooks:
# - type: command
# command: "echo 'AI-SPEC eval sections written' 2>/dev/null || true"
---
<role>
You are a GSD eval planner. Answer: "How will we know this AI system is working correctly?"
Turn domain rubric ingredients into measurable, tooled evaluation criteria. Write Sections 5-7 of AI-SPEC.md.
</role>
<required_reading>
Read `~/.claude/get-shit-done/references/ai-evals.md` before planning. This is your evaluation framework.
</required_reading>
<input>
- `system_type`: RAG | Multi-Agent | Conversational | Extraction | Autonomous | Content | Code | Hybrid
- `framework`: selected framework
- `model_provider`: OpenAI | Anthropic | Model-agnostic
- `phase_name`, `phase_goal`: from ROADMAP.md
- `ai_spec_path`: path to AI-SPEC.md
- `context_path`: path to CONTEXT.md if it exists
- `requirements_path`: path to REQUIREMENTS.md if it exists
**If prompt contains `<files_to_read>`, read every listed file before doing anything else.**
</input>
<execution_flow>
<step name="read_phase_context">
Read AI-SPEC.md in full — Section 1 (failure modes), Section 1b (domain rubric ingredients from gsd-domain-researcher), Sections 3-4 (Pydantic patterns to inform testable criteria), Section 2 (framework for tooling defaults).
Also read CONTEXT.md and REQUIREMENTS.md.
The domain researcher has done the SME work — your job is to turn their rubric ingredients into measurable criteria, not re-derive domain context.
</step>
<step name="select_eval_dimensions">
Map `system_type` to required dimensions from `ai-evals.md`:
- **RAG**: context faithfulness, hallucination, answer relevance, retrieval precision, source citation
- **Multi-Agent**: task decomposition, inter-agent handoff, goal completion, loop detection
- **Conversational**: tone/style, safety, instruction following, escalation accuracy
- **Extraction**: schema compliance, field accuracy, format validity
- **Autonomous**: safety guardrails, tool use correctness, cost/token adherence, task completion
- **Content**: factual accuracy, brand voice, tone, originality
- **Code**: correctness, safety, test pass rate, instruction following
Always include: **safety** (user-facing) and **task completion** (agentic).
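The mapping above can be sketched as a lookup table. This is a minimal sketch: dimension strings follow the list above, only two system types are shown, and the function name is illustrative.

```python
# Sketch of the system_type -> required-dimensions mapping described above.
# Only two entries are shown; the remaining types follow the same pattern.
REQUIRED_DIMENSIONS = {
    "RAG": ["context faithfulness", "hallucination", "answer relevance",
            "retrieval precision", "source citation"],
    "Extraction": ["schema compliance", "field accuracy", "format validity"],
}

# Safety and task completion are always appended, per the rule above.
ALWAYS_INCLUDED = ["safety", "task completion"]

def dimensions_for(system_type):
    dims = list(REQUIRED_DIMENSIONS.get(system_type, []))
    dims += [d for d in ALWAYS_INCLUDED if d not in dims]
    return dims
```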
</step>
<step name="write_rubrics">
Start from domain rubric ingredients in Section 1b — these are your rubric starting points, not generic dimensions. Fall back to generic `ai-evals.md` dimensions only if Section 1b is sparse.
Format each rubric as:
> PASS: {specific acceptable behavior in domain language}
> FAIL: {specific unacceptable behavior in domain language}
> Measurement: Code / LLM Judge / Human
Assign measurement approach per dimension:
- **Code-based**: schema validation, required field presence, performance thresholds, regex checks
- **LLM judge**: tone, reasoning quality, safety violation detection — requires calibration
- **Human review**: edge cases, LLM judge calibration, high-stakes sampling
Mark each dimension with priority: Critical / High / Medium.
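One possible data shape for a finished rubric, combining the format and the priority marking above. The field names and the example values are illustrative, not mandated by the spec.

```python
from dataclasses import dataclass

@dataclass
class Rubric:
    dimension: str
    pass_behavior: str   # specific acceptable behavior, in domain language
    fail_behavior: str   # specific unacceptable behavior, in domain language
    measurement: str     # "Code" | "LLM Judge" | "Human"
    priority: str        # "Critical" | "High" | "Medium"

# Example: a code-measurable dimension for an extraction system.
schema_rubric = Rubric(
    dimension="schema compliance",
    pass_behavior="Output parses as valid JSON matching the declared model",
    fail_behavior="Missing required fields or values of the wrong type",
    measurement="Code",
    priority="Critical",
)
```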
</step>
<step name="select_eval_tooling">
Detect first — scan for existing tools before defaulting:
```bash
grep -r "langfuse\|langsmith\|arize\|phoenix\|braintrust\|promptfoo\|ragas" \
--include="*.py" --include="*.ts" --include="*.toml" --include="*.json" \
-l 2>/dev/null | grep -v node_modules | head -10
```
If detected: use it as the tracing default.
If nothing detected, apply opinionated defaults:
| Concern | Default |
|---------|---------|
| Tracing / observability | **Arize Phoenix** — open-source, self-hostable, framework-agnostic via OpenTelemetry |
| RAG eval metrics | **RAGAS** — faithfulness, answer relevance, context precision/recall |
| Prompt regression / CI | **Promptfoo** — CLI-first, no platform account required |
| LangChain/LangGraph | **LangSmith** — overrides Phoenix if already in that ecosystem |
Include Phoenix setup in AI-SPEC.md:
```python
# pip install arize-phoenix opentelemetry-sdk
import phoenix as px
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
px.launch_app() # http://localhost:6006
provider = TracerProvider()
trace.set_tracer_provider(provider)
# Instrument: LlamaIndexInstrumentor().instrument() / LangChainInstrumentor().instrument()
```
</step>
<step name="specify_reference_dataset">
Define:
- **Size:** 10 examples minimum, 20 for production
- **Composition:** critical paths, edge cases, failure modes, adversarial inputs
- **Labeling approach:** domain expert / LLM judge with calibration / automated
- **Creation timeline:** start during implementation, not after
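A hedged sketch of a dataset spec meeting those minimums. The category names, per-category counts, and record fields are assumptions for illustration, not a GSD requirement.

```python
# Compose the reference dataset from the four required categories.
COMPOSITION = {
    "critical_path": 4,
    "edge_case": 3,
    "failure_mode": 2,
    "adversarial": 1,
}

dataset = [
    {"input": "TODO", "expected": "TODO", "category": category}
    for category, count in COMPOSITION.items()
    for _ in range(count)
]

assert len(dataset) >= 10, "minimum viable size; aim for 20 in production"
```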
</step>
<step name="design_guardrails">
For each critical failure mode, classify:
- **Online guardrail** (catastrophic) → runs on every request, real-time, must be fast
- **Offline flywheel** (quality signal) → sampled batch, feeds improvement loop
Keep guardrails minimal — each adds latency.
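The classification rule can be sketched as a simple router; the two failure-mode names are hypothetical examples, not part of the spec.

```python
def route_failure_mode(catastrophic):
    # Catastrophic failure modes get a fast, per-request online guardrail;
    # everything else is sampled offline and feeds the improvement loop.
    return "online_guardrail" if catastrophic else "offline_flywheel"

routes = {
    "prompt injection": route_failure_mode(catastrophic=True),
    "stale source citation": route_failure_mode(catastrophic=False),
}
```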
</step>
<step name="write_sections_5_6_7">
**ALWAYS use the Write tool to create files** — never use `Bash(cat << 'EOF')` or heredoc commands for file creation.
Update AI-SPEC.md at `ai_spec_path`:
- Section 5 (Evaluation Strategy): dimensions table with rubrics, tooling, dataset spec, CI/CD command
- Section 6 (Guardrails): online guardrails table, offline flywheel table
- Section 7 (Production Monitoring): tracing tool, key metrics, alert thresholds, sampling strategy
If domain context is genuinely unclear after reading all artifacts, ask ONE question:
```
AskUserQuestion([{
question: "What is the primary domain/industry context for this AI system?",
header: "Domain Context",
multiSelect: false,
options: [
{ label: "Internal developer tooling" },
{ label: "Customer-facing (B2C)" },
{ label: "Business tool (B2B)" },
{ label: "Regulated industry (healthcare, finance, legal)" },
{ label: "Research / experimental" }
]
}])
```
</step>
</execution_flow>
<success_criteria>
- [ ] Critical failure modes confirmed (minimum 3)
- [ ] Eval dimensions selected (minimum 3, appropriate to system type)
- [ ] Each dimension has a concrete rubric (not a generic label)
- [ ] Each dimension has a measurement approach (Code / LLM Judge / Human)
- [ ] Eval tooling selected with install command
- [ ] Reference dataset spec written (size + composition + labeling)
- [ ] CI/CD eval integration command specified
- [ ] Online guardrails defined (minimum 1 for user-facing systems)
- [ ] Offline flywheel metrics defined
- [ ] Sections 5, 6, 7 of AI-SPEC.md written and non-empty
</success_criteria>


@@ -98,6 +98,9 @@ grep -n "type=\"checkpoint" [plan-path]
At execution decision points, apply structured reasoning:
@~/.claude/get-shit-done/references/thinking-models-execution.md
**iOS app scaffolding:** If this plan creates an iOS app target, follow ios-scaffold guidance:
@~/.claude/get-shit-done/references/ios-scaffold.md
For each task:
1. **If `type="auto"`:**
@@ -362,7 +365,16 @@ git commit -m "{type}({phase}-{plan}): {concise task description}
- **Single-repo:** `TASK_COMMIT=$(git rev-parse --short HEAD)` — track for SUMMARY.
- **Multi-repo (sub_repos):** Extract hashes from `commit-to-subrepo` JSON output (`repos.{name}.hash`). Record all hashes for SUMMARY (e.g., `backend@abc1234, frontend@def5678`).
**6. Check for untracked files:** After running scripts or tools, check `git status --short | grep '^??'`. For any new untracked files: commit if intentional, add to `.gitignore` if generated/runtime output. Never leave generated files untracked.
**6. Post-commit deletion check:** After recording the hash, verify the commit did not accidentally delete tracked files:
```bash
DELETIONS=$(git diff --diff-filter=D --name-only HEAD~1 HEAD 2>/dev/null || true)
if [ -n "$DELETIONS" ]; then
echo "WARNING: Commit includes file deletions: $DELETIONS"
fi
```
Intentional deletions (e.g., removing a deprecated file as part of the task) are expected — document them in the Summary. Unexpected deletions are a Rule 1 bug: revert and fix before proceeding.
**7. Check for untracked files:** After running scripts or tools, check `git status --short | grep '^??'`. For any new untracked files: commit if intentional, add to `.gitignore` if generated/runtime output. Never leave generated files untracked.
</task_commit_protocol>
<summary_creation>


@@ -0,0 +1,160 @@
---
name: gsd-framework-selector
description: Presents an interactive decision matrix to surface the right AI/LLM framework for the user's specific use case. Produces a scored recommendation with rationale. Spawned by /gsd-ai-integration-phase and /gsd-select-framework orchestrators.
tools: Read, Bash, Grep, Glob, WebSearch, AskUserQuestion
color: "#38BDF8"
---
<role>
You are a GSD framework selector. Answer: "What AI/LLM framework is right for this project?"
Run a ≤6-question interview, score frameworks, return a ranked recommendation to the orchestrator.
</role>
<required_reading>
Read `~/.claude/get-shit-done/references/ai-frameworks.md` before asking questions. This is your decision matrix.
</required_reading>
<project_context>
Scan for existing technology signals before the interview:
```bash
find . -maxdepth 2 \( -name "package.json" -o -name "pyproject.toml" -o -name "requirements*.txt" \) -not -path "*/node_modules/*" 2>/dev/null | head -5
```
Read found files to extract: existing AI libraries, model providers, language, team size signals. This prevents recommending a framework the team has already rejected.
</project_context>
<interview>
Use a single AskUserQuestion call with ≤ 6 questions. Skip what the codebase scan or upstream CONTEXT.md already answers.
```
AskUserQuestion([
{
question: "What type of AI system are you building?",
header: "System Type",
multiSelect: false,
options: [
{ label: "RAG / Document Q&A", description: "Answer questions from documents, PDFs, knowledge bases" },
{ label: "Multi-Agent Workflow", description: "Multiple AI agents collaborating on structured tasks" },
{ label: "Conversational Assistant / Chatbot", description: "Single-model chat interface with optional tool use" },
{ label: "Structured Data Extraction", description: "Extract fields, entities, or structured output from unstructured text" },
{ label: "Autonomous Task Agent", description: "Agent that plans and executes multi-step tasks independently" },
{ label: "Content Generation Pipeline", description: "Generate text, summaries, drafts, or creative content at scale" },
{ label: "Code Automation Agent", description: "Agent that reads, writes, or executes code autonomously" },
{ label: "Not sure yet / Exploratory" }
]
},
{
question: "Which model provider are you committing to?",
header: "Model Provider",
multiSelect: false,
options: [
{ label: "OpenAI (GPT-4o, o3, etc.)", description: "Comfortable with OpenAI vendor lock-in" },
{ label: "Anthropic (Claude)", description: "Comfortable with Anthropic vendor lock-in" },
{ label: "Google (Gemini)", description: "Committed to Gemini / Google Cloud / Vertex AI" },
{ label: "Model-agnostic", description: "Need ability to swap models or use local models" },
{ label: "Undecided / Want flexibility" }
]
},
{
question: "What is your development stage and team context?",
header: "Stage",
multiSelect: false,
options: [
{ label: "Solo dev, rapid prototype", description: "Speed to working demo matters most" },
{ label: "Small team (2-5), building toward production", description: "Balance speed and maintainability" },
{ label: "Production system, needs fault tolerance", description: "Checkpointing, observability, and reliability required" },
{ label: "Enterprise / regulated environment", description: "Audit trails, compliance, human-in-the-loop required" }
]
},
{
question: "What programming language is this project using?",
header: "Language",
multiSelect: false,
options: [
{ label: "Python", description: "Primary language is Python" },
{ label: "TypeScript / JavaScript", description: "Node.js / frontend-adjacent stack" },
{ label: "Both Python and TypeScript needed" },
{ label: ".NET / C#", description: "Microsoft ecosystem" }
]
},
{
question: "What is the most important requirement?",
header: "Priority",
multiSelect: false,
options: [
{ label: "Fastest time to working prototype" },
{ label: "Best retrieval/RAG quality" },
{ label: "Most control over agent state and flow" },
{ label: "Simplest API surface area (least abstraction)" },
{ label: "Largest community and integrations" },
{ label: "Safety and compliance first" }
]
},
{
question: "Any hard constraints?",
header: "Constraints",
multiSelect: true,
options: [
{ label: "No vendor lock-in" },
{ label: "Must be open-source licensed" },
{ label: "TypeScript required (no Python)" },
{ label: "Must support local/self-hosted models" },
{ label: "Enterprise SLA / support required" },
{ label: "No new infrastructure (use existing DB)" },
{ label: "None of the above" }
]
}
])
```
</interview>
<scoring>
Apply decision matrix from `ai-frameworks.md`:
1. Eliminate frameworks failing any hard constraint
2. Score remaining 1-5 on each answered dimension
3. Weight by user's stated priority
4. Produce ranked top 3 — show only the recommendation, not the scoring table
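The four steps can be sketched as follows. Framework records, constraint keys, and weights are illustrative; the real matrix lives in `ai-frameworks.md`.

```python
def rank_frameworks(frameworks, hard_constraints, priority_weights):
    # 1. Eliminate frameworks failing any hard constraint.
    viable = [f for f in frameworks
              if all(f["constraints"].get(c, False) for c in hard_constraints)]
    # 2-3. Sum 1-5 dimension scores, weighting the user's stated priority.
    for f in viable:
        f["total"] = sum(score * priority_weights.get(dim, 1.0)
                         for dim, score in f["scores"].items())
    # 4. Ranked top 3 (only the recommendation is shown to the user).
    return sorted(viable, key=lambda f: f["total"], reverse=True)[:3]
```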
</scoring>
<output_format>
Return to orchestrator:
```
FRAMEWORK_RECOMMENDATION:
primary: {framework name and version}
rationale: {2-3 sentences — why this fits their specific answers}
alternative: {second choice if primary doesn't work out}
alternative_reason: {1 sentence}
system_type: {RAG | Multi-Agent | Conversational | Extraction | Autonomous | Content | Code | Hybrid}
model_provider: {OpenAI | Anthropic | Model-agnostic}
eval_concerns: {comma-separated primary eval dimensions for this system type}
hard_constraints: {list of constraints}
existing_ecosystem: {detected libraries from codebase scan}
```
Display to user:
```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
FRAMEWORK RECOMMENDATION
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
◆ Primary Pick: {framework}
{rationale}
◆ Alternative: {alternative}
{alternative_reason}
◆ System Type Classified: {system_type}
◆ Key Eval Dimensions: {eval_concerns}
```
</output_format>
<success_criteria>
- [ ] Codebase scanned for existing framework signals
- [ ] Interview completed (≤ 6 questions, single AskUserQuestion call)
- [ ] Hard constraints applied to eliminate incompatible frameworks
- [ ] Primary recommendation with clear rationale
- [ ] Alternative identified
- [ ] System type classified
- [ ] Structured result returned to orchestrator
</success_criteria>


@@ -26,6 +26,12 @@ If the prompt contains a `<files_to_read>` block, you MUST use the `Read` tool t
You are NOT the executor or verifier — you verify plans WILL work before execution burns context.
</role>
<required_reading>
@~/.claude/get-shit-done/references/gates.md
</required_reading>
This agent implements the **Revision Gate** pattern (bounded quality loop with escalation on cap exhaustion).
<project_context>
Before verifying, discover project context:


@@ -20,12 +20,15 @@ Your job: Goal-backward verification. Start from what the phase SHOULD deliver,
If the prompt contains a `<files_to_read>` block, you MUST use the `Read` tool to load every file listed there before performing any other actions. This is your primary context.
**Critical mindset:** Do NOT trust SUMMARY.md claims. SUMMARYs document what Claude SAID it did. You verify what ACTUALLY exists in the code. These often differ.
</role>
<required_reading>
@~/.claude/get-shit-done/references/verification-overrides.md
@~/.claude/get-shit-done/references/gates.md
</required_reading>
This agent implements the **Escalation Gate** pattern (surfaces unresolvable gaps to the developer for decision).
<project_context>
Before verifying, discover project context:


@@ -70,6 +70,8 @@ const hasCursor = args.includes('--cursor');
const hasWindsurf = args.includes('--windsurf');
const hasAugment = args.includes('--augment');
const hasTrae = args.includes('--trae');
const hasCodebuddy = args.includes('--codebuddy');
const hasCline = args.includes('--cline');
const hasBoth = args.includes('--both'); // Legacy flag, keeps working
const hasAll = args.includes('--all');
const hasUninstall = args.includes('--uninstall') || args.includes('-u');
@@ -77,7 +79,7 @@ const hasUninstall = args.includes('--uninstall') || args.includes('-u');
// Runtime selection - can be set by flags or interactive prompt
let selectedRuntimes = [];
if (hasAll) {
selectedRuntimes = ['claude', 'kilo', 'opencode', 'gemini', 'codex', 'copilot', 'antigravity', 'cursor', 'windsurf', 'augment', 'trae'];
selectedRuntimes = ['claude', 'kilo', 'opencode', 'gemini', 'codex', 'copilot', 'antigravity', 'cursor', 'windsurf', 'augment', 'trae', 'codebuddy', 'cline'];
} else if (hasBoth) {
selectedRuntimes = ['claude', 'opencode'];
} else {
@@ -92,6 +94,8 @@ if (hasAll) {
if (hasWindsurf) selectedRuntimes.push('windsurf');
if (hasAugment) selectedRuntimes.push('augment');
if (hasTrae) selectedRuntimes.push('trae');
if (hasCodebuddy) selectedRuntimes.push('codebuddy');
if (hasCline) selectedRuntimes.push('cline');
}
// WSL + Windows Node.js detection
@@ -140,6 +144,8 @@ function getDirName(runtime) {
if (runtime === 'windsurf') return '.windsurf';
if (runtime === 'augment') return '.augment';
if (runtime === 'trae') return '.trae';
if (runtime === 'codebuddy') return '.codebuddy';
if (runtime === 'cline') return '.cline';
return '.claude';
}
@@ -172,6 +178,8 @@ function getConfigDirFromHome(runtime, isGlobal) {
if (runtime === 'windsurf') return "'.windsurf'";
if (runtime === 'augment') return "'.augment'";
if (runtime === 'trae') return "'.trae'";
if (runtime === 'codebuddy') return "'.codebuddy'";
if (runtime === 'cline') return "'.cline'";
return "'.claude'";
}
@@ -303,14 +311,14 @@ function getGlobalDir(runtime, explicitDir = null) {
}
if (runtime === 'windsurf') {
// Windsurf: --config-dir > WINDSURF_CONFIG_DIR > ~/.windsurf
// Windsurf: --config-dir > WINDSURF_CONFIG_DIR > ~/.codeium/windsurf
if (explicitDir) {
return expandTilde(explicitDir);
}
if (process.env.WINDSURF_CONFIG_DIR) {
return expandTilde(process.env.WINDSURF_CONFIG_DIR);
}
return path.join(os.homedir(), '.windsurf');
return path.join(os.homedir(), '.codeium', 'windsurf');
}
if (runtime === 'augment') {
@@ -334,6 +342,27 @@ function getGlobalDir(runtime, explicitDir = null) {
return path.join(os.homedir(), '.trae');
}
if (runtime === 'codebuddy') {
// CodeBuddy: --config-dir > CODEBUDDY_CONFIG_DIR > ~/.codebuddy
if (explicitDir) {
return expandTilde(explicitDir);
}
if (process.env.CODEBUDDY_CONFIG_DIR) {
return expandTilde(process.env.CODEBUDDY_CONFIG_DIR);
}
return path.join(os.homedir(), '.codebuddy');
}
if (runtime === 'cline') {
// Cline: --config-dir > CLINE_CONFIG_DIR > ~/.cline
if (explicitDir) {
return expandTilde(explicitDir);
}
if (process.env.CLINE_CONFIG_DIR) {
return expandTilde(process.env.CLINE_CONFIG_DIR);
}
return path.join(os.homedir(), '.cline');
}
// Claude Code: --config-dir > CLAUDE_CONFIG_DIR > ~/.claude
if (explicitDir) {
@@ -355,7 +384,7 @@ const banner = '\n' +
'\n' +
' Get Shit Done ' + dim + 'v' + pkg.version + reset + '\n' +
' A meta-prompting, context engineering and spec-driven\n' +
' development system for Claude Code, OpenCode, Gemini, Kilo, Codex, Copilot, Antigravity, Cursor, Windsurf, Augment and Trae by TÂCHES.\n';
' development system for Claude Code, OpenCode, Gemini, Kilo, Codex, Copilot, Antigravity, Cursor, Windsurf, Augment, Trae, Cline and CodeBuddy by TÂCHES.\n';
// Parse --config-dir argument
function parseConfigDirArg() {
@@ -393,7 +422,7 @@ if (hasUninstall) {
// Show help if requested
if (hasHelp) {
console.log(` ${yellow}Usage:${reset} npx get-shit-done-cc [options]\n\n ${yellow}Options:${reset}\n ${cyan}-g, --global${reset} Install globally (to config directory)\n ${cyan}-l, --local${reset} Install locally (to current directory)\n ${cyan}--claude${reset} Install for Claude Code only\n ${cyan}--opencode${reset} Install for OpenCode only\n ${cyan}--gemini${reset} Install for Gemini only\n ${cyan}--kilo${reset} Install for Kilo only\n ${cyan}--codex${reset} Install for Codex only\n ${cyan}--copilot${reset} Install for Copilot only\n ${cyan}--antigravity${reset} Install for Antigravity only\n ${cyan}--cursor${reset} Install for Cursor only\n ${cyan}--windsurf${reset} Install for Windsurf only\n ${cyan}--augment${reset} Install for Augment only\n ${cyan}--trae${reset} Install for Trae only\n ${cyan}--all${reset} Install for all runtimes\n ${cyan}-u, --uninstall${reset} Uninstall GSD (remove all GSD files)\n ${cyan}-c, --config-dir <path>${reset} Specify custom config directory\n ${cyan}-h, --help${reset} Show this help message\n ${cyan}--force-statusline${reset} Replace existing statusline config\n\n ${yellow}Examples:${reset}\n ${dim}# Interactive install (prompts for runtime and location)${reset}\n npx get-shit-done-cc\n\n ${dim}# Install for Claude Code globally${reset}\n npx get-shit-done-cc --claude --global\n\n ${dim}# Install for Gemini globally${reset}\n npx get-shit-done-cc --gemini --global\n\n ${dim}# Install for Kilo globally${reset}\n npx get-shit-done-cc --kilo --global\n\n ${dim}# Install for Codex globally${reset}\n npx get-shit-done-cc --codex --global\n\n ${dim}# Install for Copilot globally${reset}\n npx get-shit-done-cc --copilot --global\n\n ${dim}# Install for Copilot locally${reset}\n npx get-shit-done-cc --copilot --local\n\n ${dim}# Install for Antigravity globally${reset}\n npx get-shit-done-cc --antigravity --global\n\n ${dim}# Install for Antigravity locally${reset}\n npx get-shit-done-cc --antigravity --local\n\n ${dim}# Install for 
Cursor globally${reset}\n npx get-shit-done-cc --cursor --global\n\n ${dim}# Install for Cursor locally${reset}\n npx get-shit-done-cc --cursor --local\n\n ${dim}# Install for Windsurf globally${reset}\n npx get-shit-done-cc --windsurf --global\n\n ${dim}# Install for Windsurf locally${reset}\n npx get-shit-done-cc --windsurf --local\n\n ${dim}# Install for Augment globally${reset}\n npx get-shit-done-cc --augment --global\n\n ${dim}# Install for Augment locally${reset}\n npx get-shit-done-cc --augment --local\n\n ${dim}# Install for Trae globally${reset}\n npx get-shit-done-cc --trae --global\n\n ${dim}# Install for Trae locally${reset}\n npx get-shit-done-cc --trae --local\n\n ${dim}# Install for all runtimes globally${reset}\n npx get-shit-done-cc --all --global\n\n ${dim}# Install to custom config directory${reset}\n npx get-shit-done-cc --kilo --global --config-dir ~/.kilo-work\n\n ${dim}# Install to current project only${reset}\n npx get-shit-done-cc --claude --local\n\n ${dim}# Uninstall GSD from Cursor globally${reset}\n npx get-shit-done-cc --cursor --global --uninstall\n\n ${yellow}Notes:${reset}\n The --config-dir option is useful when you have multiple configurations.\n It takes priority over CLAUDE_CONFIG_DIR / OPENCODE_CONFIG_DIR / GEMINI_CONFIG_DIR / KILO_CONFIG_DIR / CODEX_HOME / COPILOT_CONFIG_DIR / ANTIGRAVITY_CONFIG_DIR / CURSOR_CONFIG_DIR / WINDSURF_CONFIG_DIR / AUGMENT_CONFIG_DIR / TRAE_CONFIG_DIR environment variables.\n`);
console.log(` ${yellow}Usage:${reset} npx get-shit-done-cc [options]\n\n ${yellow}Options:${reset}\n ${cyan}-g, --global${reset} Install globally (to config directory)\n ${cyan}-l, --local${reset} Install locally (to current directory)\n ${cyan}--claude${reset} Install for Claude Code only\n ${cyan}--opencode${reset} Install for OpenCode only\n ${cyan}--gemini${reset} Install for Gemini only\n ${cyan}--kilo${reset} Install for Kilo only\n ${cyan}--codex${reset} Install for Codex only\n ${cyan}--copilot${reset} Install for Copilot only\n ${cyan}--antigravity${reset} Install for Antigravity only\n ${cyan}--cursor${reset} Install for Cursor only\n ${cyan}--windsurf${reset} Install for Windsurf only\n ${cyan}--augment${reset} Install for Augment only\n ${cyan}--trae${reset} Install for Trae only\n ${cyan}--cline${reset} Install for Cline only\n ${cyan}--codebuddy${reset} Install for CodeBuddy only\n ${cyan}--all${reset} Install for all runtimes\n ${cyan}-u, --uninstall${reset} Uninstall GSD (remove all GSD files)\n ${cyan}-c, --config-dir <path>${reset} Specify custom config directory\n ${cyan}-h, --help${reset} Show this help message\n ${cyan}--force-statusline${reset} Replace existing statusline config\n\n ${yellow}Examples:${reset}\n ${dim}# Interactive install (prompts for runtime and location)${reset}\n npx get-shit-done-cc\n\n ${dim}# Install for Claude Code globally${reset}\n npx get-shit-done-cc --claude --global\n\n ${dim}# Install for Gemini globally${reset}\n npx get-shit-done-cc --gemini --global\n\n ${dim}# Install for Kilo globally${reset}\n npx get-shit-done-cc --kilo --global\n\n ${dim}# Install for Codex globally${reset}\n npx get-shit-done-cc --codex --global\n\n ${dim}# Install for Copilot globally${reset}\n npx get-shit-done-cc --copilot --global\n\n ${dim}# Install for Copilot locally${reset}\n npx get-shit-done-cc --copilot --local\n\n ${dim}# Install for Antigravity globally${reset}\n npx get-shit-done-cc --antigravity --global\n\n ${dim}# 
Install for Antigravity locally${reset}\n npx get-shit-done-cc --antigravity --local\n\n ${dim}# Install for Cursor globally${reset}\n npx get-shit-done-cc --cursor --global\n\n ${dim}# Install for Cursor locally${reset}\n npx get-shit-done-cc --cursor --local\n\n ${dim}# Install for Windsurf globally${reset}\n npx get-shit-done-cc --windsurf --global\n\n ${dim}# Install for Windsurf locally${reset}\n npx get-shit-done-cc --windsurf --local\n\n ${dim}# Install for Augment globally${reset}\n npx get-shit-done-cc --augment --global\n\n ${dim}# Install for Augment locally${reset}\n npx get-shit-done-cc --augment --local\n\n ${dim}# Install for Trae globally${reset}\n npx get-shit-done-cc --trae --global\n\n ${dim}# Install for Trae locally${reset}\n npx get-shit-done-cc --trae --local\n\n ${dim}# Install for Cline locally${reset}\n npx get-shit-done-cc --cline --local\n\n ${dim}# Install for CodeBuddy globally${reset}\n npx get-shit-done-cc --codebuddy --global\n\n ${dim}# Install for CodeBuddy locally${reset}\n npx get-shit-done-cc --codebuddy --local\n\n ${dim}# Install for all runtimes globally${reset}\n npx get-shit-done-cc --all --global\n\n ${dim}# Install to custom config directory${reset}\n npx get-shit-done-cc --kilo --global --config-dir ~/.kilo-work\n\n ${dim}# Install to current project only${reset}\n npx get-shit-done-cc --claude --local\n\n ${dim}# Uninstall GSD from Cursor globally${reset}\n npx get-shit-done-cc --cursor --global --uninstall\n\n ${yellow}Notes:${reset}\n The --config-dir option is useful when you have multiple configurations.\n It takes priority over CLAUDE_CONFIG_DIR / OPENCODE_CONFIG_DIR / GEMINI_CONFIG_DIR / KILO_CONFIG_DIR / CODEX_HOME / COPILOT_CONFIG_DIR / ANTIGRAVITY_CONFIG_DIR / CURSOR_CONFIG_DIR / WINDSURF_CONFIG_DIR / AUGMENT_CONFIG_DIR / TRAE_CONFIG_DIR / CLINE_CONFIG_DIR / CODEBUDDY_CONFIG_DIR environment variables.\n`);
process.exit(0);
}
@@ -414,7 +443,13 @@ function expandTilde(filePath) {
function buildHookCommand(configDir, hookName) {
// Use forward slashes for Node.js compatibility on all platforms
const hooksPath = configDir.replace(/\\/g, '/') + '/hooks/' + hookName;
return `node "${hooksPath}"`;
// .sh hooks use bash; .js hooks use node. Both wrap the path in double quotes
// so that paths with spaces (e.g. Windows "C:/Users/First Last/") work correctly
// (fixes #2045). Routing .sh hooks through this function also ensures they always
// receive an absolute path rather than the bare relative string that the old manual
// concatenation produced (fixes #2046).
const runner = hookName.endsWith('.sh') ? 'bash' : 'node';
return `${runner} "${hooksPath}"`;
}
/**
@@ -1122,7 +1157,7 @@ function convertClaudeAgentToCursorAgent(content) {
// --- Windsurf converters ---
// Windsurf uses a tool set similar to Cursor.
// Config lives in .windsurf/ (local) and ~/.windsurf/ (global).
// Config lives in .windsurf/ (local) and ~/.codeium/windsurf/ (global).
// Tool name mapping from Claude Code to Windsurf Cascade
const claudeToWindsurfTools = {
@@ -1476,17 +1511,113 @@ function convertClaudeAgentToTraeAgent(content) {
return `${cleanFrontmatter}\n${body}`;
}
function convertSlashCommandsToCodebuddySkillMentions(content) {
return content.replace(/\/gsd:([a-z0-9-]+)/g, (_, commandName) => {
return `/gsd-${commandName}`;
});
}
function convertClaudeToCodebuddyMarkdown(content) {
let converted = convertSlashCommandsToCodebuddySkillMentions(content);
// CodeBuddy uses the same tool names as Claude Code (Bash, Edit, Read, Write, etc.)
// No tool name conversion needed
converted = converted.replace(/\$ARGUMENTS\b/g, '{{GSD_ARGS}}');
converted = converted.replace(/`\.\/CLAUDE\.md`/g, '`CODEBUDDY.md`');
converted = converted.replace(/\.\/CLAUDE\.md/g, 'CODEBUDDY.md');
converted = converted.replace(/`CLAUDE\.md`/g, '`CODEBUDDY.md`');
converted = converted.replace(/\bCLAUDE\.md\b/g, 'CODEBUDDY.md');
converted = converted.replace(/\.claude\/skills\//g, '.codebuddy/skills/');
converted = converted.replace(/\.\/\.claude\//g, './.codebuddy/');
converted = converted.replace(/\.claude\//g, '.codebuddy/');
converted = converted.replace(/\*\*Known Claude Code bug \(classifyHandoffIfNeeded\):\*\*[^\n]*\n/g, '');
converted = converted.replace(/- \*\*classifyHandoffIfNeeded false failure:\*\*[^\n]*\n/g, '');
converted = converted.replace(/\bClaude Code\b/g, 'CodeBuddy');
return converted;
}
function convertClaudeCommandToCodebuddySkill(content, skillName) {
const converted = convertClaudeToCodebuddyMarkdown(content);
const { frontmatter, body } = extractFrontmatterAndBody(converted);
let description = `Run GSD workflow ${skillName}.`;
if (frontmatter) {
const maybeDescription = extractFrontmatterField(frontmatter, 'description');
if (maybeDescription) {
description = maybeDescription;
}
}
description = toSingleLine(description);
const shortDescription = description.length > 180 ? `${description.slice(0, 177)}...` : description;
return `---\nname: ${yamlIdentifier(skillName)}\ndescription: ${shortDescription}\n---\n${body}`;
}
function convertClaudeAgentToCodebuddyAgent(content) {
let converted = convertClaudeToCodebuddyMarkdown(content);
const { frontmatter, body } = extractFrontmatterAndBody(converted);
if (!frontmatter) return converted;
const name = extractFrontmatterField(frontmatter, 'name') || 'unknown';
const description = extractFrontmatterField(frontmatter, 'description') || '';
const cleanFrontmatter = `---\nname: ${yamlIdentifier(name)}\ndescription: ${yamlQuote(toSingleLine(description))}\n---`;
return `${cleanFrontmatter}\n${body}`;
}
// ── Cline converters ────────────────────────────────────────────────────────
function convertClaudeToClineMarkdown(content) {
let converted = content;
// Cline uses the same tool names as Claude Code — no tool name conversion needed
converted = converted.replace(/`\.\/CLAUDE\.md`/g, '`.clinerules`');
converted = converted.replace(/\.\/CLAUDE\.md/g, '.clinerules');
converted = converted.replace(/`CLAUDE\.md`/g, '`.clinerules`');
converted = converted.replace(/\bCLAUDE\.md\b/g, '.clinerules');
converted = converted.replace(/\.claude\/skills\//g, '.cline/skills/');
converted = converted.replace(/\.\/\.claude\//g, './.cline/');
converted = converted.replace(/\.claude\//g, '.cline/');
converted = converted.replace(/\*\*Known Claude Code bug \(classifyHandoffIfNeeded\):\*\*[^\n]*\n/g, '');
converted = converted.replace(/- \*\*classifyHandoffIfNeeded false failure:\*\*[^\n]*\n/g, '');
converted = converted.replace(/\bClaude Code\b/g, 'Cline');
return converted;
}
function convertClaudeAgentToClineAgent(content) {
let converted = convertClaudeToClineMarkdown(content);
const { frontmatter, body } = extractFrontmatterAndBody(converted);
if (!frontmatter) return converted;
const name = extractFrontmatterField(frontmatter, 'name') || 'unknown';
const description = extractFrontmatterField(frontmatter, 'description') || '';
const cleanFrontmatter = `---\nname: ${yamlIdentifier(name)}\ndescription: ${yamlQuote(toSingleLine(description))}\n---`;
return `${cleanFrontmatter}\n${body}`;
}
// ── End Cline converters ─────────────────────────────────────────────────────
function convertSlashCommandsToCodexSkillMentions(content) {
// Convert colon-style skill invocations to Codex $ prefix
let converted = content.replace(/\/gsd:([a-z0-9-]+)/gi, (_, commandName) => {
return `$gsd-${String(commandName).toLowerCase()}`;
});
converted = converted.replace(/\/gsd-help\b/g, '$gsd-help');
// Convert hyphen-style command references (workflow output) to Codex $ prefix.
// Negative lookbehind excludes file paths like bin/gsd-tools.cjs where
// the slash is preceded by a word char, dot, or another slash.
converted = converted.replace(/(?<![a-zA-Z0-9./])\/gsd-([a-z0-9-]+)/gi, (_, commandName) => {
return `$gsd-${String(commandName).toLowerCase()}`;
});
return converted;
}
function convertClaudeToCodexMarkdown(content) {
let converted = convertSlashCommandsToCodexSkillMentions(content);
converted = converted.replace(/\$ARGUMENTS\b/g, '{{GSD_ARGS}}');
// Remove /clear references — Codex has no equivalent command
// Handle backtick-wrapped: `\/clear` then: → (removed)
converted = converted.replace(/`\/clear`\s*,?\s*then:?\s*\n?/gi, '');
// Handle bare: /clear then: → (removed)
converted = converted.replace(/\/clear\s*,?\s*then:?\s*\n?/gi, '');
// Handle standalone /clear on its own line
converted = converted.replace(/^\s*`?\/clear`?\s*$/gm, '');
// Path replacement: .claude → .codex (#1430)
converted = converted.replace(/\$HOME\/\.claude\//g, '$HOME/.codex/');
converted = converted.replace(/~\/\.claude\//g, '~/.codex/');
@@ -3572,7 +3703,7 @@ function copyCommandsAsWindsurfSkills(srcDir, skillsDir, prefix, pathPrefix, run
const globalClaudeRegex = /~\/\.claude\//g;
const globalClaudeHomeRegex = /\$HOME\/\.claude\//g;
const localClaudeRegex = /\.\/\.claude\//g;
const windsurfDirRegex = /~\/\.windsurf\//g;
const windsurfDirRegex = /~\/\.codeium\/windsurf\//g;
content = content.replace(globalClaudeRegex, pathPrefix);
content = content.replace(globalClaudeHomeRegex, pathPrefix);
content = content.replace(localClaudeRegex, `./${getDirName(runtime)}/`);
@@ -3646,6 +3777,69 @@ function copyCommandsAsTraeSkills(srcDir, skillsDir, prefix, pathPrefix, runtime
recurse(srcDir, prefix);
}
/**
* Copy Claude commands as CodeBuddy skills — one folder per skill with SKILL.md.
* CodeBuddy uses the same tool names as Claude Code, but has its own config directory structure.
*/
function copyCommandsAsCodebuddySkills(srcDir, skillsDir, prefix, pathPrefix, runtime) {
if (!fs.existsSync(srcDir)) {
return;
}
fs.mkdirSync(skillsDir, { recursive: true });
const existing = fs.readdirSync(skillsDir, { withFileTypes: true });
for (const entry of existing) {
if (entry.isDirectory() && entry.name.startsWith(`${prefix}-`)) {
fs.rmSync(path.join(skillsDir, entry.name), { recursive: true });
}
}
function recurse(currentSrcDir, currentPrefix) {
const entries = fs.readdirSync(currentSrcDir, { withFileTypes: true });
for (const entry of entries) {
const srcPath = path.join(currentSrcDir, entry.name);
if (entry.isDirectory()) {
recurse(srcPath, `${currentPrefix}-${entry.name}`);
continue;
}
if (!entry.name.endsWith('.md')) {
continue;
}
const baseName = entry.name.replace('.md', '');
const skillName = `${currentPrefix}-${baseName}`;
const skillDir = path.join(skillsDir, skillName);
fs.mkdirSync(skillDir, { recursive: true });
let content = fs.readFileSync(srcPath, 'utf8');
const globalClaudeRegex = /~\/\.claude\//g;
const globalClaudeHomeRegex = /\$HOME\/\.claude\//g;
const localClaudeRegex = /\.\/\.claude\//g;
const bareGlobalClaudeRegex = /~\/\.claude\b/g;
const bareGlobalClaudeHomeRegex = /\$HOME\/\.claude\b/g;
const bareLocalClaudeRegex = /\.\/\.claude\b/g;
const codebuddyDirRegex = /~\/\.codebuddy\//g;
const normalizedPathPrefix = pathPrefix.replace(/\/$/, '');
content = content.replace(globalClaudeRegex, pathPrefix);
content = content.replace(globalClaudeHomeRegex, pathPrefix);
content = content.replace(localClaudeRegex, `./${getDirName(runtime)}/`);
content = content.replace(bareGlobalClaudeRegex, normalizedPathPrefix);
content = content.replace(bareGlobalClaudeHomeRegex, normalizedPathPrefix);
content = content.replace(bareLocalClaudeRegex, `./${getDirName(runtime)}`);
content = content.replace(codebuddyDirRegex, pathPrefix);
content = processAttribution(content, getCommitAttribution(runtime));
content = convertClaudeCommandToCodebuddySkill(content, skillName);
fs.writeFileSync(path.join(skillDir, 'SKILL.md'), content);
}
}
recurse(srcDir, prefix);
}
/**
* Copy Claude commands as Copilot skills — one folder per skill with SKILL.md.
* Applies CONV-01 (structure), CONV-02 (allowed-tools), CONV-06 (paths), CONV-07 (command names).
@@ -3809,6 +4003,42 @@ function copyCommandsAsAntigravitySkills(srcDir, skillsDir, prefix, isGlobal = f
recurse(srcDir, prefix);
}
/**
* Save user-generated files from destDir to an in-memory map before a wipe.
*
* @param {string} destDir - Directory that is about to be wiped
* @param {string[]} fileNames - Relative file names (e.g. ['USER-PROFILE.md']) to preserve
* @returns {Map<string, string>} Map of fileName → file content (only entries that existed)
*/
function preserveUserArtifacts(destDir, fileNames) {
const saved = new Map();
for (const name of fileNames) {
const fullPath = path.join(destDir, name);
if (fs.existsSync(fullPath)) {
try {
saved.set(name, fs.readFileSync(fullPath, 'utf8'));
} catch { /* skip unreadable files */ }
}
}
return saved;
}
/**
* Restore user-generated files saved by preserveUserArtifacts after a wipe.
*
* @param {string} destDir - Directory that was wiped and recreated
* @param {Map<string, string>} saved - Map returned by preserveUserArtifacts
*/
function restoreUserArtifacts(destDir, saved) {
for (const [name, content] of saved) {
const fullPath = path.join(destDir, name);
try {
fs.mkdirSync(path.dirname(fullPath), { recursive: true });
fs.writeFileSync(fullPath, content, 'utf8');
} catch { /* skip unwritable paths */ }
}
}
/**
* Recursively copy directory, replacing paths in .md files
* Deletes existing destDir first to remove orphaned files from previous versions
@@ -3827,6 +4057,7 @@ function copyWithPathReplacement(srcDir, destDir, pathPrefix, runtime, isCommand
const isWindsurf = runtime === 'windsurf';
const isAugment = runtime === 'augment';
const isTrae = runtime === 'trae';
const isCline = runtime === 'cline';
const dirName = getDirName(runtime);
// Clean install: remove existing destination to prevent orphaned files
@@ -3894,6 +4125,9 @@ function copyWithPathReplacement(srcDir, destDir, pathPrefix, runtime, isCommand
} else if (isTrae) {
content = convertClaudeToTraeMarkdown(content);
fs.writeFileSync(destPath, content);
} else if (isCline) {
content = convertClaudeToCliineMarkdown(content);
fs.writeFileSync(destPath, content);
} else {
fs.writeFileSync(destPath, content);
}
@@ -3932,6 +4166,12 @@ function copyWithPathReplacement(srcDir, destDir, pathPrefix, runtime, isCommand
jsContent = jsContent.replace(/CLAUDE\.md/g, '.trae/rules/');
jsContent = jsContent.replace(/\bClaude Code\b/g, 'Trae');
fs.writeFileSync(destPath, jsContent);
} else if (isCline && (entry.name.endsWith('.cjs') || entry.name.endsWith('.js'))) {
let jsContent = fs.readFileSync(srcPath, 'utf8');
jsContent = jsContent.replace(/\.claude\/skills\//g, '.cline/skills/');
jsContent = jsContent.replace(/CLAUDE\.md/g, '.clinerules');
jsContent = jsContent.replace(/\bClaude Code\b/g, 'Cline');
fs.writeFileSync(destPath, jsContent);
} else {
fs.copyFileSync(srcPath, destPath);
}
@@ -4109,6 +4349,7 @@ function uninstall(isGlobal, runtime = 'claude') {
const isWindsurf = runtime === 'windsurf';
const isAugment = runtime === 'augment';
const isTrae = runtime === 'trae';
const isCodebuddy = runtime === 'codebuddy';
const dirName = getDirName(runtime);
// Get the target directory based on runtime and install type
@@ -4131,6 +4372,7 @@ function uninstall(isGlobal, runtime = 'claude') {
if (runtime === 'windsurf') runtimeLabel = 'Windsurf';
if (runtime === 'augment') runtimeLabel = 'Augment';
if (runtime === 'trae') runtimeLabel = 'Trae';
if (runtime === 'codebuddy') runtimeLabel = 'CodeBuddy';
console.log(` Uninstalling GSD from ${cyan}${runtimeLabel}${reset} at ${cyan}${locationLabel}${reset}\n`);
@@ -4157,8 +4399,8 @@ function uninstall(isGlobal, runtime = 'claude') {
}
console.log(` ${green}✓${reset} Removed GSD commands from command/`);
}
} else if (isCodex || isCursor || isWindsurf || isTrae) {
// Codex/Cursor/Windsurf/Trae: remove skills/gsd-*/SKILL.md skill directories
} else if (isCodex || isCursor || isWindsurf || isTrae || isCodebuddy) {
// Codex/Cursor/Windsurf/Trae/CodeBuddy: remove skills/gsd-*/SKILL.md skill directories
const skillsDir = path.join(targetDir, 'skills');
if (fs.existsSync(skillsDir)) {
let skillCount = 0;
@@ -4565,6 +4807,15 @@ function uninstall(isGlobal, runtime = 'claude') {
}
}
// Remove the file manifest that the installer wrote at install time.
// Without this step the metadata file persists after uninstall (#1908).
const manifestPath = path.join(targetDir, MANIFEST_NAME);
if (fs.existsSync(manifestPath)) {
fs.rmSync(manifestPath, { force: true });
removedCount++;
console.log(` ${green}✓${reset} Removed ${MANIFEST_NAME}`);
}
if (removedCount === 0) {
console.log(` ${yellow}⚠${reset} No GSD files found to remove.`);
}
@@ -4876,6 +5127,7 @@ function writeManifest(configDir, runtime = 'claude') {
const isCursor = runtime === 'cursor';
const isWindsurf = runtime === 'windsurf';
const isTrae = runtime === 'trae';
const isCline = runtime === 'cline';
const gsdDir = path.join(configDir, 'get-shit-done');
const commandsDir = path.join(configDir, 'commands', 'gsd');
const opencodeCommandDir = path.join(configDir, 'command');
@@ -4916,9 +5168,17 @@ function writeManifest(configDir, runtime = 'claude') {
}
}
}
// Track .clinerules file in manifest for Cline installs
if (isCline) {
const clinerulesDest = path.join(configDir, '.clinerules');
if (fs.existsSync(clinerulesDest)) {
manifest.files['.clinerules'] = fileHash(clinerulesDest);
}
}
// Track hook files so saveLocalPatches() can detect user modifications
// Hooks are only installed for runtimes that use settings.json (not Codex/Copilot)
if (!isCodex && !isCopilot) {
// Hooks are only installed for runtimes that use settings.json (not Codex/Copilot/Cline)
if (!isCodex && !isCopilot && !isCline) {
const hooksDir = path.join(configDir, 'hooks');
if (fs.existsSync(hooksDir)) {
for (const file of fs.readdirSync(hooksDir)) {
@@ -5038,13 +5298,19 @@ function install(isGlobal, runtime = 'claude') {
const isWindsurf = runtime === 'windsurf';
const isAugment = runtime === 'augment';
const isTrae = runtime === 'trae';
const isCodebuddy = runtime === 'codebuddy';
const isCline = runtime === 'cline';
const dirName = getDirName(runtime);
const src = path.join(__dirname, '..');
// Get the target directory based on runtime and install type
// Get the target directory based on runtime and install type.
// Cline local installs write to the project root (like Claude Code) — .clinerules
// lives at the root, not inside a .cline/ subdirectory.
const targetDir = isGlobal
? getGlobalDir(runtime, explicitConfigDir)
: path.join(process.cwd(), dirName);
: isCline
? process.cwd()
: path.join(process.cwd(), dirName);
const locationLabel = isGlobal
? targetDir.replace(os.homedir(), '~')
@@ -5072,6 +5338,8 @@ function install(isGlobal, runtime = 'claude') {
if (isWindsurf) runtimeLabel = 'Windsurf';
if (isAugment) runtimeLabel = 'Augment';
if (isTrae) runtimeLabel = 'Trae';
if (isCodebuddy) runtimeLabel = 'CodeBuddy';
if (isCline) runtimeLabel = 'Cline';
console.log(` Installing for ${cyan}${runtimeLabel}${reset} to ${cyan}${locationLabel}${reset}\n`);
@@ -5179,6 +5447,20 @@ function install(isGlobal, runtime = 'claude') {
} else {
failures.push('skills/gsd-*');
}
} else if (isCodebuddy) {
const skillsDir = path.join(targetDir, 'skills');
const gsdSrc = path.join(src, 'commands', 'gsd');
copyCommandsAsCodebuddySkills(gsdSrc, skillsDir, 'gsd', pathPrefix, runtime);
const installedSkillNames = listCodexSkillNames(skillsDir);
if (installedSkillNames.length > 0) {
console.log(` ${green}✓${reset} Installed ${installedSkillNames.length} skills to skills/`);
} else {
failures.push('skills/gsd-*');
}
} else if (isCline) {
// Cline is rules-based — commands are embedded in .clinerules (generated below).
// No skills/commands directory needed. Engine is installed via copyWithPathReplacement.
console.log(` ${green}✓${reset} Cline: commands will be available via .clinerules`);
} else if (isGemini) {
const commandsDir = path.join(targetDir, 'commands');
fs.mkdirSync(commandsDir, { recursive: true });
@@ -5208,10 +5490,13 @@ function install(isGlobal, runtime = 'claude') {
}
// Clean up legacy commands/gsd/ from previous global installs
// Preserve user-generated files (dev-preferences.md) before wiping the directory
const legacyCommandsDir = path.join(targetDir, 'commands', 'gsd');
if (fs.existsSync(legacyCommandsDir)) {
const savedLegacyArtifacts = preserveUserArtifacts(legacyCommandsDir, ['dev-preferences.md']);
fs.rmSync(legacyCommandsDir, { recursive: true });
console.log(` ${green}✓${reset} Removed legacy commands/gsd/ directory`);
restoreUserArtifacts(legacyCommandsDir, savedLegacyArtifacts);
}
} else {
// Claude Code local: commands/gsd/ format — Claude Code reads local project
@@ -5243,9 +5528,12 @@ function install(isGlobal, runtime = 'claude') {
}
// Copy get-shit-done skill with path replacement
// Preserve user-generated files before the wipe-and-copy so they survive re-install
const skillSrc = path.join(src, 'get-shit-done');
const skillDest = path.join(targetDir, 'get-shit-done');
const savedGsdArtifacts = preserveUserArtifacts(skillDest, ['USER-PROFILE.md']);
copyWithPathReplacement(skillSrc, skillDest, pathPrefix, runtime, false, isGlobal);
restoreUserArtifacts(skillDest, savedGsdArtifacts);
if (verifyInstalled(skillDest, 'get-shit-done')) {
console.log(` ${green}✓${reset} Installed get-shit-done`);
} else {
@@ -5306,6 +5594,10 @@ function install(isGlobal, runtime = 'claude') {
content = convertClaudeAgentToAugmentAgent(content);
} else if (isTrae) {
content = convertClaudeAgentToTraeAgent(content);
} else if (isCodebuddy) {
content = convertClaudeAgentToCodebuddyAgent(content);
} else if (isCline) {
content = convertClaudeAgentToClineAgent(content);
}
const destName = isCopilot ? entry.name.replace('.md', '.agent.md') : entry.name;
fs.writeFileSync(path.join(agentsDest, destName), content);
@@ -5339,7 +5631,7 @@ function install(isGlobal, runtime = 'claude') {
failures.push('VERSION');
}
if (!isCodex && !isCopilot && !isCursor && !isWindsurf && !isTrae) {
if (!isCodex && !isCopilot && !isCursor && !isWindsurf && !isTrae && !isCline) {
// Write package.json to force CommonJS mode for GSD scripts
// Prevents "require is not defined" errors when project has "type": "module"
// Node.js walks up looking for package.json - this stops inheritance from project
@@ -5528,6 +5820,26 @@ function install(isGlobal, runtime = 'claude') {
return { settingsPath: null, settings: null, statuslineCommand: null, runtime, configDir: targetDir };
}
if (isCline) {
// Cline uses .clinerules — generate a rules file with GSD system instructions
const clinerulesDest = path.join(targetDir, '.clinerules');
const clinerules = [
'# GSD — Get Shit Done',
'',
'- GSD workflows live in `get-shit-done/workflows/`. Load the relevant workflow when',
' the user runs a `/gsd-*` command.',
'- GSD agents live in `agents/`. Use the matching agent when spawning subagents.',
'- GSD tools are at `get-shit-done/bin/gsd-tools.cjs`. Run with `node`.',
'- Planning artifacts live in `.planning/`. Never edit them outside a GSD workflow.',
'- Do not apply GSD workflows unless the user explicitly asks for them.',
'- When a GSD command triggers a deliverable (feature, fix, docs), offer the next',
' step to the user using Cline\'s ask_user tool after completing it.',
].join('\n') + '\n';
fs.writeFileSync(clinerulesDest, clinerules);
console.log(` ${green}✓${reset} Wrote .clinerules`);
return { settingsPath: null, settings: null, statuslineCommand: null, runtime, configDir: targetDir };
}
// Configure statusline and hooks in settings.json
// Gemini and Antigravity use AfterTool instead of PostToolUse for post-tool hooks
const postToolEvent = (runtime === 'gemini' || runtime === 'antigravity') ? 'AfterTool' : 'PostToolUse';
@@ -5538,21 +5850,24 @@ function install(isGlobal, runtime = 'claude') {
return;
}
const settings = validateHookFields(cleanupOrphanedHooks(rawSettings));
// Local installs anchor paths to $CLAUDE_PROJECT_DIR so hooks resolve
// correctly regardless of the shell's current working directory (#1906).
const localPrefix = '"$CLAUDE_PROJECT_DIR"/' + dirName;
const statuslineCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-statusline.js')
: 'node ' + dirName + '/hooks/gsd-statusline.js';
: 'node ' + localPrefix + '/hooks/gsd-statusline.js';
const updateCheckCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-check-update.js')
: 'node ' + dirName + '/hooks/gsd-check-update.js';
: 'node ' + localPrefix + '/hooks/gsd-check-update.js';
const contextMonitorCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-context-monitor.js')
: 'node ' + dirName + '/hooks/gsd-context-monitor.js';
: 'node ' + localPrefix + '/hooks/gsd-context-monitor.js';
const promptGuardCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-prompt-guard.js')
: 'node ' + dirName + '/hooks/gsd-prompt-guard.js';
: 'node ' + localPrefix + '/hooks/gsd-prompt-guard.js';
const readGuardCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-read-guard.js')
: 'node ' + dirName + '/hooks/gsd-read-guard.js';
: 'node ' + localPrefix + '/hooks/gsd-read-guard.js';
// Enable experimental agents for Gemini CLI (required for custom sub-agents)
if (isGemini) {
@@ -5578,7 +5893,12 @@ function install(isGlobal, runtime = 'claude') {
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-check-update'))
);
if (!hasGsdUpdateHook) {
// Guard: only register if the hook file was actually installed (#1754).
// When hooks/dist/ is missing from the npm package (as in v1.32.0), the
// copy step produces no files but the registration step ran unconditionally,
// causing "hook error" on every tool invocation.
const checkUpdateFile = path.join(targetDir, 'hooks', 'gsd-check-update.js');
if (!hasGsdUpdateHook && fs.existsSync(checkUpdateFile)) {
settings.hooks.SessionStart.push({
hooks: [
{
@@ -5588,6 +5908,8 @@ function install(isGlobal, runtime = 'claude') {
]
});
console.log(` ${green}✓${reset} Configured update check hook`);
} else if (!hasGsdUpdateHook && !fs.existsSync(checkUpdateFile)) {
console.warn(` ${yellow}⚠${reset} Skipped update check hook — gsd-check-update.js not found at target`);
}
// Configure post-tool hook for context window monitoring
@@ -5599,7 +5921,8 @@ function install(isGlobal, runtime = 'claude') {
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-context-monitor'))
);
if (!hasContextMonitorHook) {
const contextMonitorFile = path.join(targetDir, 'hooks', 'gsd-context-monitor.js');
if (!hasContextMonitorHook && fs.existsSync(contextMonitorFile)) {
settings.hooks[postToolEvent].push({
matcher: 'Bash|Edit|Write|MultiEdit|Agent|Task',
hooks: [
@@ -5611,6 +5934,8 @@ function install(isGlobal, runtime = 'claude') {
]
});
console.log(` ${green}✓${reset} Configured context window monitor hook`);
} else if (!hasContextMonitorHook && !fs.existsSync(contextMonitorFile)) {
console.warn(` ${yellow}⚠${reset} Skipped context monitor hook — gsd-context-monitor.js not found at target`);
} else {
// Migrate existing context monitor hooks: add matcher and timeout if missing
for (const entry of settings.hooks[postToolEvent]) {
@@ -5644,7 +5969,8 @@ function install(isGlobal, runtime = 'claude') {
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-prompt-guard'))
);
if (!hasPromptGuardHook) {
const promptGuardFile = path.join(targetDir, 'hooks', 'gsd-prompt-guard.js');
if (!hasPromptGuardHook && fs.existsSync(promptGuardFile)) {
settings.hooks[preToolEvent].push({
matcher: 'Write|Edit',
hooks: [
@@ -5656,6 +5982,8 @@ function install(isGlobal, runtime = 'claude') {
]
});
console.log(` ${green}✓${reset} Configured prompt injection guard hook`);
} else if (!hasPromptGuardHook && !fs.existsSync(promptGuardFile)) {
console.warn(` ${yellow}⚠${reset} Skipped prompt guard hook — gsd-prompt-guard.js not found at target`);
}
// Configure PreToolUse hook for read-before-edit guidance (#1628)
@@ -5665,7 +5993,8 @@ function install(isGlobal, runtime = 'claude') {
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-read-guard'))
);
if (!hasReadGuardHook) {
const readGuardFile = path.join(targetDir, 'hooks', 'gsd-read-guard.js');
if (!hasReadGuardHook && fs.existsSync(readGuardFile)) {
settings.hooks[preToolEvent].push({
matcher: 'Write|Edit',
hooks: [
@@ -5677,6 +6006,8 @@ function install(isGlobal, runtime = 'claude') {
]
});
console.log(` ${green}✓${reset} Configured read-before-edit guard hook`);
} else if (!hasReadGuardHook && !fs.existsSync(readGuardFile)) {
console.warn(` ${yellow}⚠${reset} Skipped read guard hook — gsd-read-guard.js not found at target`);
}
// Community hooks — registered on install but opt-in at runtime.
@@ -5689,12 +6020,13 @@ function install(isGlobal, runtime = 'claude') {
// /gsd-quick or /gsd-fast for state-tracked changes. Advisory only.
const workflowGuardCommand = isGlobal
? buildHookCommand(targetDir, 'gsd-workflow-guard.js')
: 'node ' + dirName + '/hooks/gsd-workflow-guard.js';
: 'node ' + localPrefix + '/hooks/gsd-workflow-guard.js';
const hasWorkflowGuardHook = settings.hooks[preToolEvent].some(entry =>
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-workflow-guard'))
);
if (!hasWorkflowGuardHook) {
const workflowGuardFile = path.join(targetDir, 'hooks', 'gsd-workflow-guard.js');
if (!hasWorkflowGuardHook && fs.existsSync(workflowGuardFile)) {
settings.hooks[preToolEvent].push({
matcher: 'Write|Edit',
hooks: [
@@ -5706,12 +6038,14 @@ function install(isGlobal, runtime = 'claude') {
]
});
console.log(` ${green}✓${reset} Configured workflow guard hook (opt-in via hooks.workflow_guard)`);
} else if (!hasWorkflowGuardHook && !fs.existsSync(workflowGuardFile)) {
console.warn(` ${yellow}⚠${reset} Skipped workflow guard hook — gsd-workflow-guard.js not found at target`);
}
// Configure commit validation hook (Conventional Commits enforcement, opt-in)
const validateCommitCommand = isGlobal
? 'bash ' + targetDir.replace(/\\/g, '/') + '/hooks/gsd-validate-commit.sh'
: 'bash ' + dirName + '/hooks/gsd-validate-commit.sh';
? buildHookCommand(targetDir, 'gsd-validate-commit.sh')
: 'bash ' + localPrefix + '/hooks/gsd-validate-commit.sh';
const hasValidateCommitHook = settings.hooks[preToolEvent].some(entry =>
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-validate-commit'))
);
@@ -5737,8 +6071,8 @@ function install(isGlobal, runtime = 'claude') {
// Configure session state orientation hook (opt-in)
const sessionStateCommand = isGlobal
? 'bash ' + targetDir.replace(/\\/g, '/') + '/hooks/gsd-session-state.sh'
: 'bash ' + dirName + '/hooks/gsd-session-state.sh';
? buildHookCommand(targetDir, 'gsd-session-state.sh')
: 'bash ' + localPrefix + '/hooks/gsd-session-state.sh';
const hasSessionStateHook = settings.hooks.SessionStart.some(entry =>
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-session-state'))
);
@@ -5759,8 +6093,8 @@ function install(isGlobal, runtime = 'claude') {
// Configure phase boundary detection hook (opt-in)
const phaseBoundaryCommand = isGlobal
? 'bash ' + targetDir.replace(/\\/g, '/') + '/hooks/gsd-phase-boundary.sh'
: 'bash ' + dirName + '/hooks/gsd-phase-boundary.sh';
? buildHookCommand(targetDir, 'gsd-phase-boundary.sh')
: 'bash ' + localPrefix + '/hooks/gsd-phase-boundary.sh';
const hasPhaseBoundaryHook = settings.hooks[postToolEvent].some(entry =>
entry.hooks && entry.hooks.some(h => h.command && h.command.includes('gsd-phase-boundary'))
);
@@ -5796,6 +6130,7 @@ function finishInstall(settingsPath, settings, statuslineCommand, shouldInstallS
const isCursor = runtime === 'cursor';
const isWindsurf = runtime === 'windsurf';
const isTrae = runtime === 'trae';
const isCline = runtime === 'cline';
if (shouldInstallStatusline && !isOpencode && !isKilo && !isCodex && !isCopilot && !isCursor && !isWindsurf && !isTrae) {
settings.statusLine = {
@@ -5806,7 +6141,7 @@ function finishInstall(settingsPath, settings, statuslineCommand, shouldInstallS
}
// Write settings when runtime supports settings.json
if (!isCodex && !isCopilot && !isKilo && !isCursor && !isWindsurf && !isTrae) {
if (!isCodex && !isCopilot && !isKilo && !isCursor && !isWindsurf && !isTrae && !isCline) {
writeSettings(settingsPath, settings);
}
@@ -5852,6 +6187,7 @@ function finishInstall(settingsPath, settings, statuslineCommand, shouldInstallS
if (runtime === 'windsurf') program = 'Windsurf';
if (runtime === 'augment') program = 'Augment';
if (runtime === 'trae') program = 'Trae';
if (runtime === 'cline') program = 'Cline';
let command = '/gsd-new-project';
if (runtime === 'opencode') command = '/gsd-new-project';
@@ -5863,6 +6199,7 @@ function finishInstall(settingsPath, settings, statuslineCommand, shouldInstallS
if (runtime === 'windsurf') command = '/gsd-new-project';
if (runtime === 'augment') command = '/gsd-new-project';
if (runtime === 'trae') command = '/gsd-new-project';
if (runtime === 'cline') command = '/gsd-new-project';
console.log(`
${green}Done!${reset} Open a blank directory in ${program} and run ${cyan}${command}${reset}.
@@ -5944,29 +6281,33 @@ function promptRuntime(callback) {
'1': 'claude',
'2': 'antigravity',
'3': 'augment',
'4': 'codex',
'5': 'copilot',
'6': 'cursor',
'7': 'gemini',
'8': 'kilo',
'9': 'opencode',
'10': 'trae',
'11': 'windsurf'
'4': 'cline',
'5': 'codebuddy',
'6': 'codex',
'7': 'copilot',
'8': 'cursor',
'9': 'gemini',
'10': 'kilo',
'11': 'opencode',
'12': 'trae',
'13': 'windsurf'
};
const allRuntimes = ['claude', 'antigravity', 'augment', 'codex', 'copilot', 'cursor', 'gemini', 'kilo', 'opencode', 'trae', 'windsurf'];
const allRuntimes = ['claude', 'antigravity', 'augment', 'cline', 'codebuddy', 'codex', 'copilot', 'cursor', 'gemini', 'kilo', 'opencode', 'trae', 'windsurf'];
console.log(` ${yellow}Which runtime(s) would you like to install for?${reset}\n\n ${cyan}1${reset}) Claude Code ${dim}(~/.claude)${reset}
${cyan}2${reset}) Antigravity ${dim}(~/.gemini/antigravity)${reset}
${cyan}3${reset}) Augment ${dim}(~/.augment)${reset}
${cyan}4${reset}) Codex ${dim}(~/.codex)${reset}
${cyan}5${reset}) Copilot ${dim}(~/.copilot)${reset}
${cyan}6${reset}) Cursor ${dim}(~/.cursor)${reset}
${cyan}7${reset}) Gemini ${dim}(~/.gemini)${reset}
${cyan}8${reset}) Kilo ${dim}(~/.config/kilo)${reset}
${cyan}9${reset}) OpenCode ${dim}(~/.config/opencode)${reset}
${cyan}10${reset}) Trae ${dim}(~/.trae)${reset}
${cyan}11${reset}) Windsurf ${dim}(~/.windsurf)${reset}
${cyan}12${reset}) All
${cyan}4${reset}) Cline ${dim}(.clinerules)${reset}
${cyan}5${reset}) CodeBuddy ${dim}(~/.codebuddy)${reset}
${cyan}6${reset}) Codex ${dim}(~/.codex)${reset}
${cyan}7${reset}) Copilot ${dim}(~/.copilot)${reset}
${cyan}8${reset}) Cursor ${dim}(~/.cursor)${reset}
${cyan}9${reset}) Gemini ${dim}(~/.gemini)${reset}
${cyan}10${reset}) Kilo ${dim}(~/.config/kilo)${reset}
${cyan}11${reset}) OpenCode ${dim}(~/.config/opencode)${reset}
${cyan}12${reset}) Trae ${dim}(~/.trae)${reset}
${cyan}13${reset}) Windsurf ${dim}(~/.codeium/windsurf)${reset}
${cyan}14${reset}) All
${dim}Select multiple: 1,2,6 or 1 2 6${reset}
`);
@@ -5977,7 +6318,7 @@ function promptRuntime(callback) {
const input = answer.trim() || '1';
// "All" shortcut
if (input === '12') {
if (input === '14') {
callback(allRuntimes);
return;
}
@@ -6138,9 +6479,18 @@ if (process.env.GSD_TEST_MODE) {
convertClaudeCommandToTraeSkill,
convertClaudeAgentToTraeAgent,
copyCommandsAsTraeSkills,
convertClaudeToCodebuddyMarkdown,
convertClaudeCommandToCodebuddySkill,
convertClaudeAgentToCodebuddyAgent,
copyCommandsAsCodebuddySkills,
convertClaudeToCliineMarkdown,
convertClaudeAgentToClineAgent,
writeManifest,
reportLocalPatches,
validateHookFields,
preserveUserArtifacts,
restoreUserArtifacts,
finishInstall,
};
} else {


@@ -0,0 +1,36 @@
---
name: gsd:ai-integration-phase
description: Generate AI design contract (AI-SPEC.md) for phases that involve building AI systems — framework selection, implementation guidance from official docs, and evaluation strategy
argument-hint: "[phase number]"
allowed-tools:
- Read
- Write
- Bash
- Glob
- Grep
- Task
- WebFetch
- WebSearch
- AskUserQuestion
- mcp__context7__*
---
<objective>
Create an AI design contract (AI-SPEC.md) for a phase involving AI system development.
Orchestrates gsd-framework-selector → gsd-ai-researcher → gsd-domain-researcher → gsd-eval-planner.
Flow: Select Framework → Research Docs → Research Domain → Design Eval Strategy → Done
</objective>
<execution_context>
@~/.claude/get-shit-done/workflows/ai-integration-phase.md
@~/.claude/get-shit-done/references/ai-frameworks.md
@~/.claude/get-shit-done/references/ai-evals.md
</execution_context>
<context>
Phase number: $ARGUMENTS — optional, auto-detects next unplanned phase if omitted.
</context>
<process>
Execute @~/.claude/get-shit-done/workflows/ai-integration-phase.md end-to-end.
Preserve all workflow gates.
</process>


@@ -10,6 +10,7 @@ allowed-tools:
- Grep
- AskUserQuestion
- Task
- Agent
---
<objective>
Execute all remaining milestone phases autonomously. For each phase: discuss → plan → execute. Pauses only for user decisions (grey area acceptance, blockers, validation requests).


@@ -0,0 +1,32 @@
---
name: gsd:eval-review
description: Retroactively audit an executed AI phase's evaluation coverage — scores each eval dimension as COVERED/PARTIAL/MISSING and produces an actionable EVAL-REVIEW.md with remediation plan
argument-hint: "[phase number]"
allowed-tools:
- Read
- Write
- Bash
- Glob
- Grep
- Task
- AskUserQuestion
---
<objective>
Conduct a retroactive evaluation coverage audit of a completed AI phase.
Checks whether the evaluation strategy from AI-SPEC.md was implemented.
Produces EVAL-REVIEW.md with score, verdict, gaps, and remediation plan.
</objective>
<execution_context>
@~/.claude/get-shit-done/workflows/eval-review.md
@~/.claude/get-shit-done/references/ai-evals.md
</execution_context>
<context>
Phase: $ARGUMENTS — optional, defaults to last completed phase.
</context>
<process>
Execute @~/.claude/get-shit-done/workflows/eval-review.md end-to-end.
Preserve all workflow gates.
</process>


@@ -217,20 +217,74 @@ git -C "$CONFIG_DIR" log --oneline --no-merges -- "{file_path}" | grep -v "gsd:u
Each matching commit represents an intentional user modification. Use the commit messages and diffs to understand what was changed and why.
4. **Write merged result** to the installed location
5. **Report status per file:**
### Post-merge verification
After writing each merged file, verify that user modifications survived the merge:
1. **Line-count check:** Count lines in the backup and the merged result. If the merged result has fewer lines than the backup minus the expected upstream removals, flag for review.
2. **Hunk presence check:** For each user-added section identified during diff analysis, search the merged output for at least the first significant line (non-blank, non-comment) of each addition. Missing signature lines indicate a dropped hunk.
3. **Report warnings inline** (do not block):
```
⚠ Potential dropped content in {file_path}:
- Missing hunk near line {N}: "{first_line_preview}..." ({line_count} lines)
- Backup available: {patches_dir}/{file_path}
```
4. **Produce a Hunk Verification Table** — one row per hunk per file. This table is **mandatory output** and must be produced before Step 5 can proceed. Format:
| file | hunk_id | signature_line | line_count | verified |
|------|---------|----------------|------------|----------|
| {file_path} | {N} | {first_significant_line} | {count} | yes |
| {file_path} | {N} | {first_significant_line} | {count} | no |
- `hunk_id` — sequential integer per file (1, 2, 3…)
- `signature_line` — first non-blank, non-comment line of the user-added section
- `line_count` — total lines in the hunk
- `verified` — `yes` if the signature_line is present in the merged output, `no` otherwise
5. **Track verification status** — add to per-file report: `Merged (verified)` vs `Merged (⚠ {N} hunks may be missing)`
6. **Report status per file:**
- `Merged` — user modifications applied cleanly (show summary of what was preserved)
- `Conflict` — user reviewed and chose resolution
- `Incorporated` — user's modification was already adopted upstream (only valid when pristine baseline confirms this)
**Never report `Skipped — no custom content`.** If a file is in the backup, it has custom content.
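The table construction and signature-line check above can be sketched as follows (the hunk representation, a list of lines per user-added section, is an assumption for illustration; this is not GSD's actual implementation):

```python
def first_signature_line(hunk_lines):
    """Return the first non-blank, non-comment line of a user-added hunk."""
    for line in hunk_lines:
        stripped = line.strip()
        if stripped and not stripped.startswith("#"):
            return stripped
    return None

def verify_hunks(hunks, merged_text):
    """Build one Hunk Verification Table row per hunk.

    A hunk is verified iff its signature line survives in the merged output.
    """
    rows = []
    for i, hunk in enumerate(hunks, start=1):
        sig = first_signature_line(hunk)
        rows.append({
            "hunk_id": i,
            "signature_line": sig,
            "line_count": len(hunk),
            "verified": sig is not None and sig in merged_text,
        })
    return rows
```

The signature line deliberately skips blanks and comments so that cosmetic merge changes (reindented comments, moved blank lines) do not produce false negatives.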
## Step 5: Hunk Verification Gate
Before proceeding to cleanup, evaluate the Hunk Verification Table produced in Step 4.
**If the Hunk Verification Table is absent** (Step 4 did not produce it), STOP immediately and report to the user:
```
ERROR: Hunk Verification Table is missing. Post-merge verification was not completed.
Rerun /gsd-reapply-patches to retry with full verification.
```
**If any row in the Hunk Verification Table shows `verified: no`**, STOP and report to the user:
```
ERROR: {N} hunk(s) failed verification — content may have been dropped during merge.
Unverified hunks:
{file} hunk {hunk_id}: signature line "{signature_line}" not found in merged output
The backup is preserved at: {patches_dir}/{file}
Review the merged file manually, then either:
(a) Re-merge the missing content by hand, or
(b) Restore from backup: cp {patches_dir}/{file} {installed_path}
```
Do not proceed to cleanup until the user confirms they have resolved all unverified hunks.
**Only when all rows show `verified: yes`** (or when all files had zero user-added hunks) may execution continue to Step 6.
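The gate evaluation reduces to a small check over the table rows (a sketch; the row shape matches the Step 4 table, and the return labels are illustrative):

```python
def hunk_gate(rows):
    """Evaluate the Hunk Verification Gate.

    The table must exist (verification actually ran) and every row
    must be verified before execution may continue.
    """
    if rows is None:  # table absent: Step 4 never produced it
        return "ERROR: Hunk Verification Table is missing"
    failed = [r for r in rows if not r["verified"]]
    if failed:
        return f"ERROR: {len(failed)} hunk(s) failed verification"
    return "ok"  # all rows verified, or zero user-added hunks
```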
## Step 6: Cleanup option
Ask user:
- "Keep patch backups for reference?" → preserve `gsd-local-patches/`
- "Clean up patch backups?" → remove `gsd-local-patches/` directory
## Step 7: Report
```
## Patches Reapplied
@@ -253,4 +307,5 @@ Ask user:
- [ ] User modifications identified and merged into new version
- [ ] Conflicts surfaced to user with both versions shown
- [ ] Status reported for each file with summary of what was preserved
- [ ] Post-merge verification checks each file for dropped hunks and warns if content appears missing
</success_criteria>


@@ -113,7 +113,7 @@ User-facing entry points. Each file contains YAML frontmatter (name, description
- **Copilot:** Slash commands (`/gsd-command-name`)
- **Antigravity:** Skills
**Total commands:** 69
### Workflows (`get-shit-done/workflows/*.md`)
@@ -124,7 +124,7 @@ Orchestration logic that commands reference. Contains the step-by-step process i
- State update patterns
- Error handling and recovery
**Total workflows:** 68
### Agents (`agents/*.md`)
@@ -134,23 +134,26 @@ Specialized agent definitions with frontmatter specifying:
- `tools` — Allowed tool access (Read, Write, Edit, Bash, Grep, Glob, WebSearch, etc.)
- `color` — Terminal output color for visual distinction
**Total agents:** 24
### References (`get-shit-done/references/*.md`)
Shared knowledge documents that workflows and agents `@-reference` (35 total):
**Core references:**
- `checkpoints.md` — Checkpoint type definitions and interaction patterns
- `gates.md` — 4 canonical gate types (Confirm, Quality, Safety, Transition) wired into plan-checker and verifier
- `model-profiles.md` — Per-agent model tier assignments
- `model-profile-resolution.md` — Model resolution algorithm documentation
- `verification-patterns.md` — How to verify different artifact types
- `verification-overrides.md` — Per-artifact verification override rules
- `planning-config.md` — Full config schema and behavior
- `git-integration.md` — Git commit, branching, and history patterns
- `git-planning-commit.md` — Planning directory commit conventions
- `questioning.md` — Dream extraction philosophy for project initialization
- `tdd.md` — Test-driven development integration patterns
- `ui-brand.md` — Visual output formatting patterns
- `common-bug-patterns.md` — Common bug patterns for code review and verification
**Workflow references:**
- `agent-contracts.md` — Formal interface between orchestrators and agents
@@ -165,6 +168,17 @@ Shared knowledge documents that workflows and agents `@-reference` (25 total):
- `decimal-phase-calculation.md` — Decimal sub-phase numbering rules
- `workstream-flag.md` — Workstream active pointer conventions
- `user-profiling.md` — User behavioral profiling methodology
- `thinking-partner.md` — Conditional thinking partner activation at decision points
**Thinking model references:**
References for integrating thinking-class models (o3, o4-mini, Gemini 2.5 Pro) into GSD workflows:
- `thinking-models-debug.md` — Thinking model patterns for debugging workflows
- `thinking-models-execution.md` — Thinking model patterns for execution agents
- `thinking-models-planning.md` — Thinking model patterns for planning agents
- `thinking-models-research.md` — Thinking model patterns for research agents
- `thinking-models-verification.md` — Thinking model patterns for verification agents
**Modular planner decomposition:**
@@ -395,14 +409,14 @@ UI-SPEC.md (per phase) ───────────────────
```
~/.claude/ # Claude Code (global install)
├── commands/gsd/*.md # 69 slash commands
├── get-shit-done/
│ ├── bin/gsd-tools.cjs # CLI utility
│ ├── bin/lib/*.cjs # 19 domain modules
│ ├── workflows/*.md # 68 workflow definitions
│ ├── references/*.md # 35 shared reference docs
│ └── templates/ # Planning artifact templates
├── agents/*.md # 24 agent definitions
├── hooks/
│ ├── gsd-statusline.js # Statusline hook
│ ├── gsd-context-monitor.js # Context warning hook


@@ -542,6 +542,57 @@ Show all commands and usage guide.
## Utility Commands
### `/gsd-explore`
Socratic ideation session — guide an idea through probing questions, optionally spawn research, then route output to the right GSD artifact (notes, todos, seeds, research questions, requirements, or a new phase).
| Argument | Required | Description |
|----------|----------|-------------|
| `topic` | No | Topic to explore (e.g., `/gsd-explore authentication strategy`) |
```bash
/gsd-explore # Open-ended ideation session
/gsd-explore authentication strategy # Explore a specific topic
```
---
### `/gsd-undo`
Safe git revert — roll back GSD phase or plan commits using the phase manifest with dependency checks and a confirmation gate.
| Flag | Required | Description |
|------|----------|-------------|
| `--last N` | (one of three required) | Show recent GSD commits for interactive selection |
| `--phase NN` | (one of three required) | Revert all commits for a phase |
| `--plan NN-MM` | (one of three required) | Revert all commits for a specific plan |
**Safety:** Checks dependent phases/plans before reverting; always shows a confirmation gate.
```bash
/gsd-undo --last 5 # Pick from the 5 most recent GSD commits
/gsd-undo --phase 03 # Revert all commits for phase 3
/gsd-undo --plan 03-02 # Revert commits for plan 02 of phase 3
```
---
### `/gsd-import`
Ingest an external plan file into the GSD planning system with conflict detection against `PROJECT.md` decisions before writing anything.
| Flag | Required | Description |
|------|----------|-------------|
| `--from <filepath>` | **Yes** | Path to the external plan file to import |
**Process:** Detects conflicts → prompts for resolution → writes as GSD PLAN.md → validates via `gsd-plan-checker`
```bash
/gsd-import --from /tmp/team-plan.md # Import and validate an external plan
```
---
### `/gsd-quick`
Execute an ad-hoc task with GSD guarantees.
@@ -809,6 +860,46 @@ Analyze existing codebase with parallel mapper agents.
---
### `/gsd-scan`
Rapid single-focus codebase assessment — lightweight alternative to `/gsd-map-codebase` that spawns one mapper agent instead of four parallel ones.
| Flag | Description |
|------|-------------|
| `--focus tech\|arch\|quality\|concerns\|tech+arch` | Focus area (default: `tech+arch`) |
**Produces:** Targeted document(s) in `.planning/codebase/`
```bash
/gsd-scan # Quick tech + arch overview
/gsd-scan --focus quality # Quality and code health only
/gsd-scan --focus concerns # Surface concerns and risk areas
```
---
### `/gsd-intel`
Query, inspect, or refresh queryable codebase intelligence files stored in `.planning/intel/`. Requires `intel.enabled: true` in `config.json`.
| Argument | Description |
|----------|-------------|
| `query <term>` | Search intel files for a term |
| `status` | Show intel file freshness (FRESH/STALE) |
| `diff` | Show changes since last snapshot |
| `refresh` | Rebuild all intel files from codebase analysis |
**Produces:** `.planning/intel/` JSON files (stack, api-map, dependency-graph, file-roles, arch-decisions)
```bash
/gsd-intel status # Check freshness of intel files
/gsd-intel query authentication # Search intel for a term
/gsd-intel diff # What changed since last snapshot
/gsd-intel refresh # Rebuild intel index
```
---
## Update Commands
### `/gsd-update`
@@ -829,6 +920,75 @@ Restore local modifications after a GSD update.
---
## Code Quality Commands
### `/gsd-code-review`
Review source files changed during a phase for bugs, security vulnerabilities, and code quality problems.
| Argument | Required | Description |
|----------|----------|-------------|
| `N` | **Yes** | Phase number whose changes to review (e.g., `2` or `02`) |
| `--depth=quick\|standard\|deep` | No | Review depth level (overrides `workflow.code_review_depth` config). `quick`: pattern-matching only (~2 min). `standard`: per-file analysis with language-specific checks (~5-15 min, default). `deep`: cross-file analysis including import graphs and call chains (~15-30 min) |
| `--files file1,file2,...` | No | Explicit comma-separated file list; skips SUMMARY/git scoping entirely |
**Prerequisites:** Phase has been executed and has SUMMARY.md or git history
**Produces:** `{phase}-REVIEW.md` in phase directory with severity-classified findings
**Spawns:** `gsd-code-reviewer` agent
```bash
/gsd-code-review 3 # Standard review for phase 3
/gsd-code-review 2 --depth=deep # Deep cross-file review
/gsd-code-review 4 --files src/auth.ts,src/token.ts # Explicit file list
```
---
### `/gsd-code-review-fix`
Auto-fix issues found by `/gsd-code-review`. Reads `REVIEW.md`, spawns a fixer agent, commits each fix atomically, and produces a `REVIEW-FIX.md` summary.
| Argument | Required | Description |
|----------|----------|-------------|
| `N` | **Yes** | Phase number whose REVIEW.md to fix |
| `--all` | No | Include Info findings in fix scope (default: Critical + Warning only) |
| `--auto` | No | Enable fix + re-review iteration loop, capped at 3 iterations |
**Prerequisites:** Phase has a `{phase}-REVIEW.md` file (run `/gsd-code-review` first)
**Produces:** `{phase}-REVIEW-FIX.md` with applied fixes summary
**Spawns:** `gsd-code-fixer` agent
```bash
/gsd-code-review-fix 3 # Fix Critical + Warning findings for phase 3
/gsd-code-review-fix 3 --all # Include Info findings
/gsd-code-review-fix 3 --auto # Fix and re-review until clean (max 3 iterations)
```
---
### `/gsd-audit-fix`
Autonomous audit-to-fix pipeline — runs an audit, classifies findings, fixes auto-fixable issues with test verification, and commits each fix atomically.
| Flag | Description |
|------|-------------|
| `--source <audit>` | Which audit to run (default: `audit-uat`) |
| `--severity high\|medium\|all` | Minimum severity to process (default: `medium`) |
| `--max N` | Maximum findings to fix (default: 5) |
| `--dry-run` | Classify findings without fixing (shows classification table) |
**Prerequisites:** At least one phase has been executed with UAT or verification
**Produces:** Fix commits with test verification; classification report
```bash
/gsd-audit-fix # Run audit-uat, fix medium+ issues (max 5)
/gsd-audit-fix --severity high # Only fix high-severity issues
/gsd-audit-fix --dry-run # Preview classification without fixing
/gsd-audit-fix --max 10 --severity all # Fix up to 10 issues of any severity
```
---
## Fast & Inline Commands
### `/gsd-fast`
@@ -848,8 +1008,6 @@ Execute a trivial task inline — no subagents, no planning overhead. For typo f
---
## Code Quality Commands
### `/gsd-review`
Cross-AI peer review of phase plans from external AI CLIs.


@@ -20,6 +20,7 @@ GSD stores project settings in `.planning/config.json`. Created during `/gsd-new
"commit_docs": true,
"search_gitignored": false
},
"context_profile": null,
"workflow": {
"research": true,
"plan_check": true,
@@ -34,7 +35,9 @@ GSD stores project settings in `.planning/config.json`. Created during `/gsd-new
"discuss_mode": "discuss",
"skip_discuss": false,
"text_mode": false,
"use_worktrees": true,
"code_review": true,
"code_review_depth": "standard"
},
"hooks": {
"context_warnings": true,
@@ -73,7 +76,17 @@ GSD stores project settings in `.planning/config.json`. Created during `/gsd-new
"security_asvs_level": 1,
"security_block_on": "high",
"agent_skills": {},
"response_language": null,
"features": {
"thinking_partner": false,
"global_learnings": false
},
"learnings": {
"max_inject": 10
},
"intel": {
"enabled": false
}
}
```
@@ -88,6 +101,7 @@ GSD stores project settings in `.planning/config.json`. Created during `/gsd-new
| `model_profile` | enum | `quality`, `balanced`, `budget`, `inherit` | `balanced` | Model tier for each agent (see [Model Profiles](#model-profiles)) |
| `project_code` | string | any short string | (none) | Prefix for phase directory names (e.g., `"ABC"` produces `ABC-01-setup/`). Added in v1.31 |
| `response_language` | string | language code | (none) | Language for agent responses (e.g., `"pt"`, `"ko"`, `"ja"`). Propagates to all spawned agents for cross-phase language consistency. Added in v1.32 |
| `context_profile` | string | `dev`, `research`, `review` | (none) | Execution context preset that applies a pre-configured bundle of mode, model, and workflow settings for the current type of work. Added in v1.34 |
> **Note:** `granularity` was renamed from `depth` in v1.22.3. Existing configs are auto-migrated.
@@ -113,6 +127,8 @@ All workflow toggles follow the **absent = enabled** pattern. If a key is missin
| `workflow.skip_discuss` | boolean | `false` | When `true`, `/gsd-autonomous` bypasses the discuss-phase entirely, writing minimal CONTEXT.md from the ROADMAP phase goal. Useful for projects where developer preferences are fully captured in PROJECT.md/REQUIREMENTS.md. Added in v1.28 |
| `workflow.text_mode` | boolean | `false` | Replaces AskUserQuestion TUI menus with plain-text numbered lists. Required for Claude Code remote sessions (`/rc` mode) where TUI menus don't render. Can also be set per-session with `--text` flag on discuss-phase. Added in v1.28 |
| `workflow.use_worktrees` | boolean | `true` | When `false`, disables git worktree isolation for parallel execution. Users who prefer sequential execution or whose environment does not support worktrees can disable this. Added in v1.31 |
| `workflow.code_review` | boolean | `true` | Enable `/gsd-code-review` and `/gsd-code-review-fix` commands. When `false`, the commands exit with a configuration gate message. Added in v1.34 |
| `workflow.code_review_depth` | string | `standard` | Default review depth for `/gsd-code-review`: `quick` (pattern-matching only), `standard` (per-file analysis), or `deep` (cross-file with import graphs). Can be overridden per-run with `--depth=`. Added in v1.34 |
### Recommended Presets
@@ -230,6 +246,7 @@ Toggle optional capabilities via the `features.*` config namespace. Feature flag
|---------|------|---------|-------------|
| `features.thinking_partner` | boolean | `false` | Enable thinking partner analysis at workflow decision points |
| `features.global_learnings` | boolean | `false` | Enable cross-project learnings pipeline (auto-copy at phase completion, planner injection) |
| `intel.enabled` | boolean | `false` | Enable queryable codebase intelligence system. When `true`, `/gsd-intel` commands build and query a JSON index in `.planning/intel/`. Added in v1.34 |
### Usage
@@ -343,14 +360,6 @@ Settings for the security enforcement feature (v1.31). All follow the **absent =
---
## Hook Settings
| Setting | Type | Default | Description |
|---------|------|---------|-------------|
| `hooks.context_warnings` | boolean | `true` | Show context window usage warnings during sessions |
---
## Manager Passthrough Flags
Configure per-step flags that `/gsd-manager` appends to each dispatched command. This allows customizing how the manager runs discuss, plan, and execute steps without manual flag entry.


@@ -86,6 +86,22 @@
- [Worktree Toggle](#66-worktree-toggle)
- [Project Code Prefixing](#67-project-code-prefixing)
- [Claude Code Skills Migration](#68-claude-code-skills-migration)
- [v1.34.0 Features](#v1340-features)
- [Global Learnings Store](#89-global-learnings-store)
- [Queryable Codebase Intelligence](#90-queryable-codebase-intelligence)
- [Execution Context Profiles](#91-execution-context-profiles)
- [Gates Taxonomy](#92-gates-taxonomy)
- [Code Review Pipeline](#93-code-review-pipeline)
- [Socratic Exploration](#94-socratic-exploration)
- [Safe Undo](#95-safe-undo)
- [Plan Import](#96-plan-import)
- [Rapid Codebase Scan](#97-rapid-codebase-scan)
- [Autonomous Audit-to-Fix](#98-autonomous-audit-to-fix)
- [Improved Prompt Injection Scanner](#99-improved-prompt-injection-scanner)
- [Stall Detection in Plan-Phase](#100-stall-detection-in-plan-phase)
- [Hard Stop Safety Gates in /gsd-next](#101-hard-stop-safety-gates-in-gsd-next)
- [Adaptive Model Preset](#102-adaptive-model-preset)
- [Post-Merge Hunk Verification](#103-post-merge-hunk-verification)
- [v1.32 Features](#v132-features)
- [STATE.md Consistency Gates](#69-statemd-consistency-gates)
- [Autonomous `--to N` Flag](#70-autonomous---to-n-flag)
@@ -1894,3 +1910,272 @@ Test suite that scans all agent, workflow, and command files for embedded inject
| Setting | Type | Default | Description |
|---------|------|---------|-------------|
| `hooks.community` | boolean | `false` | Enable optional community hooks for commit validation, session state, and phase boundaries |
---
## v1.34.0 Features
- [Global Learnings Store](#89-global-learnings-store)
- [Queryable Codebase Intelligence](#90-queryable-codebase-intelligence)
- [Execution Context Profiles](#91-execution-context-profiles)
- [Gates Taxonomy](#92-gates-taxonomy)
- [Code Review Pipeline](#93-code-review-pipeline)
- [Socratic Exploration](#94-socratic-exploration)
- [Safe Undo](#95-safe-undo)
- [Plan Import](#96-plan-import)
- [Rapid Codebase Scan](#97-rapid-codebase-scan)
- [Autonomous Audit-to-Fix](#98-autonomous-audit-to-fix)
- [Improved Prompt Injection Scanner](#99-improved-prompt-injection-scanner)
- [Stall Detection in Plan-Phase](#100-stall-detection-in-plan-phase)
- [Hard Stop Safety Gates in /gsd-next](#101-hard-stop-safety-gates-in-gsd-next)
- [Adaptive Model Preset](#102-adaptive-model-preset)
- [Post-Merge Hunk Verification](#103-post-merge-hunk-verification)
---
### 89. Global Learnings Store
**Commands:** Auto-triggered at phase completion; consumed by planner
**Config:** `features.global_learnings`
**Purpose:** Persist cross-session, cross-project learnings in a global store so the planner agent can learn from patterns across the entire project history — not just the current session.
**Requirements:**
- REQ-LEARN-01: Learnings MUST be auto-copied from `.planning/` to the global store at phase completion
- REQ-LEARN-02: The planner agent MUST receive relevant learnings at spawn time via injection
- REQ-LEARN-03: Injection MUST be capped by `learnings.max_inject` to avoid context bloat
- REQ-LEARN-04: Feature MUST be opt-in via `features.global_learnings: true`
**Config:**
| Setting | Type | Default | Description |
|---------|------|---------|-------------|
| `features.global_learnings` | boolean | `false` | Enable cross-project learnings pipeline |
| `learnings.max_inject` | number | (system default) | Maximum learnings entries injected into planner |
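The injection cap can be sketched as follows (the entry shape, recency ordering, and default cap of 10 are assumptions for illustration; the real pipeline lives in GSD's planner spawn logic):

```python
def select_learnings(learnings, config):
    """Pick at most learnings.max_inject entries for planner injection."""
    if not config.get("features", {}).get("global_learnings", False):
        return []  # feature is opt-in; absent or false means no injection
    cap = config.get("learnings", {}).get("max_inject", 10)
    # Most recent phase first, then truncate to the cap to avoid context bloat.
    ordered = sorted(learnings, key=lambda entry: entry["phase"], reverse=True)
    return ordered[:cap]
```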
---
### 90. Queryable Codebase Intelligence
**Command:** `/gsd-intel [query <term>|status|diff|refresh]`
**Config:** `intel.enabled`
**Purpose:** Maintain a queryable JSON index of codebase structure, API surface, dependency graph, file roles, and architecture decisions in `.planning/intel/`. Enables targeted lookups without reading the entire codebase.
**Requirements:**
- REQ-INTEL-01: Intel files MUST be stored as JSON in `.planning/intel/`
- REQ-INTEL-02: `query` mode MUST search across all intel files for a term and group results by file
- REQ-INTEL-03: `status` mode MUST report freshness (FRESH/STALE, stale threshold: 24 hours)
- REQ-INTEL-04: `diff` mode MUST compare current intel state to the last snapshot
- REQ-INTEL-05: `refresh` mode MUST spawn the intel-updater agent to rebuild all files
- REQ-INTEL-06: Feature MUST be opt-in via `intel.enabled: true`
**Intel files produced:**
| File | Contents |
|------|----------|
| `stack.json` | Technology stack and dependencies |
| `api-map.json` | Exported functions and API surface |
| `dependency-graph.json` | Inter-module dependency relationships |
| `file-roles.json` | Role classification for each source file |
| `arch-decisions.json` | Detected architecture decisions |
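Query and status modes can be sketched against the JSON files above (grouping hits by top-level key is an illustrative simplification of "group results by file"; function names are not GSD's):

```python
import json
import time
from pathlib import Path

def intel_query(intel_dir, term):
    """Search every intel JSON file for a term; group matching top-level keys by file."""
    results = {}
    for path in sorted(Path(intel_dir).glob("*.json")):
        data = json.loads(path.read_text())
        hits = [key for key, value in data.items()
                if term.lower() in key.lower()
                or term.lower() in json.dumps(value).lower()]
        if hits:
            results[path.name] = hits
    return results

def intel_status(intel_dir, stale_after_s=24 * 3600):
    """FRESH if the file was rebuilt within the 24-hour threshold, else STALE."""
    now = time.time()
    return {path.name: "FRESH" if now - path.stat().st_mtime < stale_after_s else "STALE"
            for path in sorted(Path(intel_dir).glob("*.json"))}
```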
---
### 91. Execution Context Profiles
**Config:** `context_profile`
**Purpose:** Select a pre-configured execution context (mode, model, workflow settings) tuned for a specific type of work without manually adjusting individual settings.
**Requirements:**
- REQ-CTX-01: `dev` profile MUST optimize for iterative development (balanced model, plan_check enabled)
- REQ-CTX-02: `research` profile MUST optimize for research-heavy work (higher model tier, research enabled)
- REQ-CTX-03: `review` profile MUST optimize for code review work (verifier and code_review enabled)
**Available profiles:** `dev`, `research`, `review`
**Config:**
| Setting | Type | Default | Description |
|---------|------|---------|-------------|
| `context_profile` | string | (none) | Execution context preset: `dev`, `research`, or `review` |
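Profile application can be sketched as a preset bundle overlaid on the current config (the bundle contents shown are illustrative, not GSD's actual presets):

```python
# Illustrative preset bundles; GSD's real profile contents may differ.
PROFILES = {
    "dev":      {"model_profile": "balanced", "workflow": {"plan_check": True}},
    "research": {"model_profile": "quality",  "workflow": {"research": True}},
    "review":   {"model_profile": "balanced", "workflow": {"verifier": True,
                                                           "code_review": True}},
}

def apply_profile(config, name):
    """Overlay a context profile's preset bundle onto a copy of the config."""
    preset = PROFILES[name]
    merged = dict(config)
    merged["model_profile"] = preset["model_profile"]
    # Preset workflow keys win; everything else in workflow is preserved.
    merged["workflow"] = {**config.get("workflow", {}), **preset["workflow"]}
    merged["context_profile"] = name
    return merged
```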
---
### 92. Gates Taxonomy
**References:** `get-shit-done/references/gates.md`
**Agents:** plan-checker, verifier
**Purpose:** Define 4 canonical gate types that structure all workflow decision points, enabling plan-checker and verifier agents to apply consistent gate logic.
**Gate types:**
| Type | Description |
|------|-------------|
| **Confirm** | User approves before proceeding (e.g., roadmap review) |
| **Quality** | Automated quality check must pass (e.g., plan verification loop) |
| **Safety** | Hard stop on detected risk or policy violation |
| **Transition** | Phase or milestone boundary acknowledgment |
**Requirements:**
- REQ-GATES-01: plan-checker MUST classify each checkpoint as one of the 4 gate types
- REQ-GATES-02: verifier MUST apply gate logic appropriate to the gate type
- REQ-GATES-03: Hard stop safety gates MUST never be bypassed by `--auto` flags
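The per-type gate logic can be sketched as follows (the return labels are illustrative, not GSD's actual API; note that safety stops unconditionally, per REQ-GATES-03):

```python
def evaluate_gate(gate_type, *, user_approved=False, check_passed=False):
    """Apply the logic appropriate to each of the 4 canonical gate types."""
    if gate_type == "safety":
        return "stop"  # hard stop: never bypassed, even by --auto flags
    if gate_type == "confirm":
        return "proceed" if user_approved else "wait"
    if gate_type == "quality":
        return "proceed" if check_passed else "retry"
    if gate_type == "transition":
        return "acknowledge"  # boundary acknowledgment, no pass/fail
    raise ValueError(f"unknown gate type: {gate_type}")
```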
---
### 93. Code Review Pipeline
**Commands:** `/gsd-code-review`, `/gsd-code-review-fix`
**Purpose:** Structured review of source files changed during a phase, with a separate auto-fix pass that commits each fix atomically.
**Requirements:**
- REQ-REVIEW-01: `gsd-code-review` MUST scope files to the phase using SUMMARY.md and git diff fallback
- REQ-REVIEW-02: Review MUST support three depth levels: `quick`, `standard`, `deep`
- REQ-REVIEW-03: Findings MUST be severity-classified: Critical, Warning, Info
- REQ-REVIEW-04: `gsd-code-review-fix` MUST read REVIEW.md and fix Critical + Warning findings by default
- REQ-REVIEW-05: Each fix MUST be committed atomically with a descriptive message
- REQ-REVIEW-06: `--auto` flag MUST enable fix + re-review iteration loop, capped at 3 iterations
- REQ-REVIEW-07: Feature MUST be gated by `workflow.code_review` config flag
**Config:**
| Setting | Type | Default | Description |
|---------|------|---------|-------------|
| `workflow.code_review` | boolean | `true` | Enable code review commands |
| `workflow.code_review_depth` | string | `standard` | Default review depth: `quick`, `standard`, or `deep` |
---
### 94. Socratic Exploration
**Command:** `/gsd-explore [topic]`
**Purpose:** Guide a developer through exploring an idea via Socratic probing questions before committing to a plan. Routes outputs to the appropriate GSD artifact: notes, todos, seeds, research questions, requirements updates, or a new phase.
**Requirements:**
- REQ-EXPLORE-01: Exploration MUST use Socratic probing — ask questions before proposing solutions
- REQ-EXPLORE-02: Session MUST offer to route outputs to the appropriate GSD artifact
- REQ-EXPLORE-03: An optional topic argument MUST prime the first question
- REQ-EXPLORE-04: Exploration MUST optionally spawn a research agent for technical feasibility
---
### 95. Safe Undo
**Command:** `/gsd-undo --last N | --phase NN | --plan NN-MM`
**Purpose:** Roll back GSD phase or plan commits safely using the phase manifest and git log, with dependency checks and a hard confirmation gate before any revert is applied.
**Requirements:**
- REQ-UNDO-01: `--phase` mode MUST identify all commits for the phase via manifest and git log fallback
- REQ-UNDO-02: `--plan` mode MUST identify all commits for a specific plan
- REQ-UNDO-03: `--last N` mode MUST display recent GSD commits for interactive selection
- REQ-UNDO-04: System MUST check for dependent phases/plans before reverting
- REQ-UNDO-05: A confirmation gate MUST be shown before any git revert is executed
---
### 96. Plan Import
**Command:** `/gsd-import --from <filepath>`
**Purpose:** Ingest an external plan file into the GSD planning system with conflict detection against `PROJECT.md` decisions, converting it to a valid GSD PLAN.md and validating it through the plan-checker.
**Requirements:**
- REQ-IMPORT-01: Importer MUST detect conflicts between the external plan and existing PROJECT.md decisions
- REQ-IMPORT-02: All detected conflicts MUST be presented to the user for resolution before writing
- REQ-IMPORT-03: Imported plan MUST be written as a valid GSD PLAN.md format
- REQ-IMPORT-04: Written plan MUST pass `gsd-plan-checker` validation
---
### 97. Rapid Codebase Scan
**Command:** `/gsd-scan [--focus tech|arch|quality|concerns|tech+arch]`
**Purpose:** Lightweight alternative to `/gsd-map-codebase` that spawns a single mapper agent for a specific focus area, producing targeted output in `.planning/codebase/` without the overhead of 4 parallel agents.
**Requirements:**
- REQ-SCAN-01: Scan MUST spawn exactly one mapper agent (not four parallel agents)
- REQ-SCAN-02: Focus area MUST be one of: `tech`, `arch`, `quality`, `concerns`, `tech+arch` (default)
- REQ-SCAN-03: Output MUST be written to `.planning/codebase/` in the same format as `/gsd-map-codebase`
---
### 98. Autonomous Audit-to-Fix
**Command:** `/gsd-audit-fix [--source <audit>] [--severity high|medium|all] [--max N] [--dry-run]`
**Purpose:** End-to-end pipeline that runs an audit, classifies findings as auto-fixable vs. manual-only, then autonomously fixes auto-fixable issues with test verification and atomic commits.
**Requirements:**
- REQ-AUDITFIX-01: Findings MUST be classified as auto-fixable or manual-only before any changes
- REQ-AUDITFIX-02: Each fix MUST be verified with tests before committing
- REQ-AUDITFIX-03: Each fix MUST be committed atomically
- REQ-AUDITFIX-04: `--dry-run` MUST show classification table without applying any fixes
- REQ-AUDITFIX-05: `--max N` MUST limit the number of fixes applied in one run (default: 5)
---
### 99. Improved Prompt Injection Scanner
**Hook:** `gsd-prompt-guard.js`
**Script:** `scripts/prompt-injection-scan.sh`
**Purpose:** Enhanced detection of prompt injection attempts in planning artifacts, adding invisible Unicode character detection, encoding obfuscation patterns, and entropy-based analysis.
**Requirements:**
- REQ-SCAN-INJ-01: Scanner MUST detect invisible Unicode characters (zero-width spaces, soft hyphens, etc.)
- REQ-SCAN-INJ-02: Scanner MUST detect encoding obfuscation patterns (base64-encoded instructions, homoglyphs)
- REQ-SCAN-INJ-03: Scanner MUST apply entropy analysis to flag high-entropy strings in unexpected positions
- REQ-SCAN-INJ-04: Scanner MUST remain advisory-only — detection is logged, not blocking
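A minimal sketch of two of the detection passes (the character set, entropy threshold, and minimum token length are illustrative assumptions, not the scanner's actual tuning):

```python
import math

# Zero-width space/joiners, soft hyphen, BOM: invisible in rendered text.
INVISIBLE = {"\u200b", "\u200c", "\u200d", "\u00ad", "\ufeff"}

def shannon_entropy(s):
    """Bits per character; long encoded blobs score higher than prose."""
    if not s:
        return 0.0
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    return -sum(c / len(s) * math.log2(c / len(s)) for c in counts.values())

def scan_line(line, entropy_threshold=4.5, min_token_len=40):
    """Advisory-only: return finding labels, never block (REQ-SCAN-INJ-04)."""
    findings = []
    if any(ch in INVISIBLE for ch in line):
        findings.append("invisible-unicode")
    for token in line.split():
        if len(token) >= min_token_len and shannon_entropy(token) > entropy_threshold:
            findings.append("high-entropy-token")
            break
    return findings
```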
---
### 100. Stall Detection in Plan-Phase
**Command:** `/gsd-plan-phase`
**Purpose:** Detect when the planner revision loop has stalled — producing the same output across multiple iterations — and break the cycle by escalating to a different strategy or exiting with a clear diagnostic.
**Requirements:**
- REQ-STALL-01: Revision loop MUST detect identical plan output across consecutive iterations
- REQ-STALL-02: On stall detection, system MUST escalate strategy before retrying
- REQ-STALL-03: Maximum stall retries MUST be bounded (capped at the existing max 3 iterations)
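The identical-output check can be sketched by hashing consecutive planner outputs (the window size and use of SHA-256 are illustrative choices):

```python
import hashlib

def detect_stall(outputs, window=2):
    """Stalled iff the last `window` iterations produced byte-identical plans."""
    if len(outputs) < window:
        return False
    digests = [hashlib.sha256(o.encode()).hexdigest() for o in outputs[-window:]]
    return len(set(digests)) == 1
```

Hashing rather than storing full plans keeps the comparison cheap even when each iteration's output is large.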
---
### 101. Hard Stop Safety Gates in /gsd-next
**Command:** `/gsd-next`
**Purpose:** Prevent `/gsd-next` from entering runaway loops by adding hard stop safety gates and a consecutive-call guard that interrupts autonomous chaining when repeated identical steps are detected.
**Requirements:**
- REQ-NEXT-GATE-01: `/gsd-next` MUST track consecutive same-step calls
- REQ-NEXT-GATE-02: On repeated same-step, system MUST present a hard stop gate to the user
- REQ-NEXT-GATE-03: User MUST explicitly confirm to continue past a hard stop gate
---
### 102. Adaptive Model Preset
**Config:** `model_profile: "adaptive"`
**Purpose:** Role-based model assignment that automatically selects the appropriate model tier based on the current agent's role, rather than applying a single tier to all agents.
**Requirements:**
- REQ-ADAPTIVE-01: `adaptive` preset MUST assign model tiers based on agent role (planner → quality tier, executor → balanced tier, etc.)
- REQ-ADAPTIVE-02: `adaptive` MUST be selectable via `/gsd-set-profile adaptive`
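Role-based resolution can be sketched with a role-to-tier table (the assignments shown are illustrative, not GSD's actual mapping):

```python
# Illustrative role→tier table; GSD's real adaptive assignments may differ.
ADAPTIVE_TIERS = {
    "planner":  "quality",
    "executor": "balanced",
    "verifier": "balanced",
    "mapper":   "budget",
}

def resolve_model_tier(agent_role, model_profile="adaptive"):
    """Fixed presets apply one tier to every agent; adaptive picks per role."""
    if model_profile != "adaptive":
        return model_profile
    return ADAPTIVE_TIERS.get(agent_role, "balanced")
```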
---
### 103. Post-Merge Hunk Verification
**Command:** `/gsd-reapply-patches`
**Purpose:** After applying local patches post-update, verify that all hunks were actually applied by comparing the expected patch content against the live filesystem. Surface any dropped or partial hunks immediately rather than silently accepting incomplete merges.
**Requirements:**
- REQ-PATCH-VERIFY-01: Reapply-patches MUST verify each hunk was applied after the merge
- REQ-PATCH-VERIFY-02: Dropped or partial hunks MUST be reported to the user with file and line context
- REQ-PATCH-VERIFY-03: Verification MUST run after all patches are applied, not per-patch


@@ -387,6 +387,87 @@ The `security.cjs` module scans for known injection patterns (role overrides, in
---
## Code Review Workflow
### Phase Code Review
After executing a phase, run a structured code review before UAT:
```bash
/gsd-code-review 3 # Review all changed files in phase 3
/gsd-code-review 3 --depth=deep # Deep cross-file review (import graphs, call chains)
```
The reviewer scopes files automatically using SUMMARY.md (preferred) or git diff fallback. Findings are classified as Critical, Warning, or Info in `{phase}-REVIEW.md`.
```bash
/gsd-code-review-fix 3 # Fix Critical + Warning findings atomically
/gsd-code-review-fix 3 --auto # Fix and re-review until clean (max 3 iterations)
```
### Autonomous Audit-to-Fix
To run an audit and fix all auto-fixable issues in one pass:
```bash
/gsd-audit-fix # Audit + classify + fix (medium+ severity, max 5)
/gsd-audit-fix --dry-run # Preview classification without fixing
```
### Code Review in the Full Phase Lifecycle
The review step slots in after execution and before UAT:
```
/gsd-execute-phase N -> /gsd-code-review N -> /gsd-code-review-fix N -> /gsd-verify-work N
```
---
## Exploration & Discovery
### Socratic Exploration
Before committing to a new phase or plan, use `/gsd-explore` to think through the idea:
```bash
/gsd-explore # Open-ended ideation
/gsd-explore "caching strategy" # Explore a specific topic
```
The exploration session guides you through probing questions, optionally spawns a research agent, and routes output to the appropriate GSD artifact: note, todo, seed, research question, requirements update, or new phase.
### Codebase Intelligence
For queryable codebase insights without reading the entire codebase, enable the intel system:
```json
{ "intel": { "enabled": true } }
```
Then build the index:
```bash
/gsd-intel refresh # Analyze codebase and write .planning/intel/ files
/gsd-intel query auth # Search for a term across all intel files
/gsd-intel status # Check freshness of intel files
/gsd-intel diff # See what changed since last snapshot
```
Intel files cover stack, API surface, dependency graph, file roles, and architecture decisions.
### Quick Scan
For a focused assessment without full `/gsd-map-codebase` overhead:
```bash
/gsd-scan # Quick tech + arch overview
/gsd-scan --focus quality # Quality and code health only
/gsd-scan --focus concerns # Risk areas and concerns
```
---
## Command Reference
### Core Workflow
@@ -436,9 +517,14 @@ The `security.cjs` module scans for known injection patterns (role overrides, in
| Command | Purpose | When to Use |
|---------|---------|-------------|
| `/gsd-map-codebase` | Analyze existing codebase | Before `/gsd-new-project` on existing code |
| `/gsd-map-codebase` | Analyze existing codebase (4 parallel agents) | Before `/gsd-new-project` on existing code |
| `/gsd-scan [--focus area]` | Rapid single-focus codebase scan (1 agent) | Quick assessment of a specific area |
| `/gsd-intel [query\|status\|diff\|refresh]` | Query codebase intelligence index | Look up APIs, deps, or architecture decisions |
| `/gsd-explore [topic]` | Socratic ideation — think through an idea before committing | Exploring unfamiliar solution space |
| `/gsd-quick` | Ad-hoc task with GSD guarantees | Bug fixes, small features, config changes |
| `/gsd-autonomous` | Run remaining phases autonomously (`--from N`, `--to N`) | Hands-free multi-phase execution |
| `/gsd-undo --last N\|--phase NN\|--plan NN-MM` | Safe git revert using phase manifest | Roll back a bad execution |
| `/gsd-import --from <file>` | Ingest external plan with conflict detection | Import plans from teammates or other tools |
| `/gsd-debug [desc]` | Systematic debugging with persistent state (`--diagnose` for no-fix mode) | When something breaks |
| `/gsd-forensics` | Diagnostic report for workflow failures | When state, artifacts, or git history seem corrupted |
| `/gsd-add-todo [desc]` | Capture an idea for later | Think of something during a session |
@@ -452,6 +538,9 @@ The `security.cjs` module scans for known injection patterns (role overrides, in
| Command | Purpose | When to Use |
|---------|---------|-------------|
| `/gsd-review --phase N` | Cross-AI peer review from external CLIs | Before executing, to validate plans |
| `/gsd-code-review <N>` | Review source files changed in a phase for bugs and security issues | After execution, before verification |
| `/gsd-code-review-fix <N>` | Auto-fix issues found by `/gsd-code-review` | After code review produces REVIEW.md |
| `/gsd-audit-fix` | Autonomous audit-to-fix pipeline with classification and atomic commits | After UAT surfaces fixable issues |
| `/gsd-pr-branch` | Clean PR branch filtering `.planning/` commits | Before creating PR with planning-free diff |
| `/gsd-audit-uat` | Audit verification debt across all phases | Before milestone completion |

View File

@@ -294,6 +294,17 @@ async function main() {
args.splice(pickIdx, 2);
}
// --default <value>: for config-get, return this value instead of erroring
// when the key is absent. Allows workflows to express optional config reads
// without defensive `2>/dev/null || true` boilerplate (#1893).
const defaultIdx = args.indexOf('--default');
let defaultValue = undefined;
if (defaultIdx !== -1) {
defaultValue = args[defaultIdx + 1];
if (defaultValue === undefined) defaultValue = '';
args.splice(defaultIdx, 2);
}
const command = args[0];
if (!command) {
@@ -346,7 +357,7 @@ async function main() {
}
};
try {
await runCommand(command, args, cwd, raw);
await runCommand(command, args, cwd, raw, defaultValue);
cleanup();
} catch (e) {
fs.writeSync = origWriteSync;
@@ -355,7 +366,27 @@ async function main() {
return;
}
await runCommand(command, args, cwd, raw);
// Intercept stdout to transparently resolve @file: references (#1891).
// core.cjs output() writes @file:<path> when JSON > 50KB. The --pick path
// already resolves this, but the normal path wrote @file: to stdout, forcing
// every workflow to have a bash-specific `if [[ "$INIT" == @file:* ]]` check
// that breaks on PowerShell and other non-bash shells.
const origWriteSync2 = fs.writeSync;
const outChunks = [];
fs.writeSync = function (fd, data, ...rest) {
if (fd === 1) { outChunks.push(String(data)); return; }
return origWriteSync2.call(fs, fd, data, ...rest);
};
try {
await runCommand(command, args, cwd, raw, defaultValue);
} finally {
fs.writeSync = origWriteSync2;
}
let captured = outChunks.join('');
if (captured.startsWith('@file:')) {
captured = fs.readFileSync(captured.slice(6), 'utf-8');
}
origWriteSync2.call(fs, 1, captured);
}
/**
@@ -381,7 +412,7 @@ function extractField(obj, fieldPath) {
return current;
}
async function runCommand(command, args, cwd, raw) {
async function runCommand(command, args, cwd, raw, defaultValue) {
switch (command) {
case 'state': {
const subcommand = args[1];
@@ -589,7 +620,7 @@ async function runCommand(command, args, cwd, raw) {
}
case 'config-get': {
config.cmdConfigGet(cwd, args[1], raw);
config.cmdConfigGet(cwd, args[1], raw, defaultValue);
break;
}

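The precedence the new `--default` flag introduces can be condensed to a small pure function (a sketch of the resolution order, not the exact `cmdConfigGet` code): the resolved value wins, then the supplied default, then a hard error.

```javascript
// Sketch of the --default precedence in config-get (#1893):
// resolved value > supplied default > hard error.
function resolveConfigValue(config, keyPath, defaultValue) {
  let current = config;
  for (const key of keyPath.split('.')) {
    if (current === null || typeof current !== 'object') {
      if (defaultValue !== undefined) return defaultValue;
      throw new Error(`Key not found: ${keyPath}`);
    }
    current = current[key];
  }
  if (current === undefined) {
    if (defaultValue !== undefined) return defaultValue;
    throw new Error(`Key not found: ${keyPath}`);
  }
  return current;
}
```

Because `undefined` marks "no default supplied", the CLI's coercion of a bare `--default` to `''` (shown above) keeps an empty-string default distinct from no default at all.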
View File

@@ -313,11 +313,19 @@ function cmdCommit(cwd, message, files, raw, amend, noVerify) {
}
// Stage files
const filesToStage = files && files.length > 0 ? files : ['.planning/'];
const explicitFiles = files && files.length > 0;
const filesToStage = explicitFiles ? files : ['.planning/'];
for (const file of filesToStage) {
const fullPath = path.join(cwd, file);
if (!fs.existsSync(fullPath)) {
// File was deleted/moved — stage the deletion
if (explicitFiles) {
// Caller passed an explicit --files list: missing files are skipped.
// Staging a deletion here would silently remove tracked planning files
// (e.g. STATE.md, ROADMAP.md) when they are temporarily absent (#2014).
continue;
}
// Default mode (staging all of .planning/): stage the deletion so
// removed planning files are not left dangling in the index.
execGit(cwd, ['rm', '--cached', '--ignore-unmatch', file]);
} else {
execGit(cwd, ['add', file]);

View File

@@ -4,7 +4,7 @@
const fs = require('fs');
const path = require('path');
const { output, error, planningRoot, CONFIG_DEFAULTS } = require('./core.cjs');
const { output, error, planningDir, withPlanningLock, CONFIG_DEFAULTS, atomicWriteFileSync } = require('./core.cjs');
const {
VALID_PROFILES,
getAgentToModelMapForProfile,
@@ -15,7 +15,7 @@ const VALID_CONFIG_KEYS = new Set([
'mode', 'granularity', 'parallelization', 'commit_docs', 'model_profile',
'search_gitignored', 'brave_search', 'firecrawl', 'exa_search',
'workflow.research', 'workflow.plan_check', 'workflow.verifier',
'workflow.nyquist_validation', 'workflow.ui_phase', 'workflow.ui_safety_gate',
'workflow.nyquist_validation', 'workflow.ai_integration_phase', 'workflow.ui_phase', 'workflow.ui_safety_gate',
'workflow.auto_advance', 'workflow.node_repair', 'workflow.node_repair_budget',
'workflow.text_mode',
'workflow.research_before_questions',
@@ -47,6 +47,8 @@ function isValidConfigKey(keyPath) {
if (VALID_CONFIG_KEYS.has(keyPath)) return true;
// Allow agent_skills.<agent-type> with any agent type string
if (/^agent_skills\.[a-zA-Z0-9_-]+$/.test(keyPath)) return true;
// Allow review.models.<cli-name> for per-CLI model selection in /gsd-review
if (/^review\.models\.[a-zA-Z0-9_-]+$/.test(keyPath)) return true;
// Allow features.<feature_name> — dynamic namespace for feature flags.
// Intentionally open-ended so new flags (e.g., features.global_learnings) work
// without updating VALID_CONFIG_KEYS each time.
@@ -64,6 +66,7 @@ const CONFIG_KEY_SUGGESTIONS = {
'workflow.review': 'workflow.code_review',
'workflow.code_review_level': 'workflow.code_review_depth',
'workflow.review_depth': 'workflow.code_review_depth',
'review.model': 'review.models.<cli-name>',
};
function validateKnownConfigKeyPath(keyPath) {
@@ -143,6 +146,7 @@ function buildNewProjectConfig(userChoices) {
node_repair_budget: 2,
ui_phase: true,
ui_safety_gate: true,
ai_integration_phase: true,
text_mode: false,
research_before_questions: false,
discuss_mode: 'discuss',
@@ -196,7 +200,7 @@ function buildNewProjectConfig(userChoices) {
* Idempotent: if config.json already exists, returns { created: false }.
*/
function cmdConfigNewProject(cwd, choicesJson, raw) {
const planningBase = planningRoot(cwd);
const planningBase = planningDir(cwd);
const configPath = path.join(planningBase, 'config.json');
// Idempotent: don't overwrite existing config
@@ -227,7 +231,7 @@ function cmdConfigNewProject(cwd, choicesJson, raw) {
const config = buildNewProjectConfig(userChoices);
try {
fs.writeFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
atomicWriteFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
output({ created: true, path: '.planning/config.json' }, raw, 'created');
} catch (err) {
error('Failed to write config.json: ' + err.message);
@@ -241,7 +245,7 @@ function cmdConfigNewProject(cwd, choicesJson, raw) {
* the happy path. But note that `error()` will still `exit(1)` out of the process.
*/
function ensureConfigFile(cwd) {
const planningBase = planningRoot(cwd);
const planningBase = planningDir(cwd);
const configPath = path.join(planningBase, 'config.json');
// Ensure .planning directory exists
@@ -261,7 +265,7 @@ function ensureConfigFile(cwd) {
const config = buildNewProjectConfig({});
try {
fs.writeFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
atomicWriteFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
return { created: true, path: '.planning/config.json' };
} catch (err) {
error('Failed to create config.json: ' + err.message);
@@ -291,38 +295,40 @@ function cmdConfigEnsureSection(cwd, raw) {
* the happy path. But note that `error()` will still `exit(1)` out of the process.
*/
function setConfigValue(cwd, keyPath, parsedValue) {
const configPath = path.join(planningRoot(cwd), 'config.json');
const configPath = path.join(planningDir(cwd), 'config.json');
// Load existing config or start with empty object
let config = {};
try {
if (fs.existsSync(configPath)) {
config = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
return withPlanningLock(cwd, () => {
// Load existing config or start with empty object
let config = {};
try {
if (fs.existsSync(configPath)) {
config = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
}
} catch (err) {
error('Failed to read config.json: ' + err.message);
}
} catch (err) {
error('Failed to read config.json: ' + err.message);
}
// Set nested value using dot notation (e.g., "workflow.research")
const keys = keyPath.split('.');
let current = config;
for (let i = 0; i < keys.length - 1; i++) {
const key = keys[i];
if (current[key] === undefined || typeof current[key] !== 'object') {
current[key] = {};
// Set nested value using dot notation (e.g., "workflow.research")
const keys = keyPath.split('.');
let current = config;
for (let i = 0; i < keys.length - 1; i++) {
const key = keys[i];
if (current[key] === undefined || typeof current[key] !== 'object') {
current[key] = {};
}
current = current[key];
}
current = current[key];
}
const previousValue = current[keys[keys.length - 1]]; // Capture previous value before overwriting
current[keys[keys.length - 1]] = parsedValue;
const previousValue = current[keys[keys.length - 1]]; // Capture previous value before overwriting
current[keys[keys.length - 1]] = parsedValue;
// Write back
try {
fs.writeFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
return { updated: true, key: keyPath, value: parsedValue, previousValue };
} catch (err) {
error('Failed to write config.json: ' + err.message);
}
// Write back
try {
atomicWriteFileSync(configPath, JSON.stringify(config, null, 2), 'utf-8');
return { updated: true, key: keyPath, value: parsedValue, previousValue };
} catch (err) {
error('Failed to write config.json: ' + err.message);
}
});
}
/**
@@ -361,17 +367,21 @@ function cmdConfigSet(cwd, keyPath, value, raw) {
output(setConfigValueResult, raw, `${keyPath}=${parsedValue}`);
}
function cmdConfigGet(cwd, keyPath, raw) {
const configPath = path.join(planningRoot(cwd), 'config.json');
function cmdConfigGet(cwd, keyPath, raw, defaultValue) {
const configPath = path.join(planningDir(cwd), 'config.json');
const hasDefault = defaultValue !== undefined;
if (!keyPath) {
error('Usage: config-get <key.path>');
error('Usage: config-get <key.path> [--default <value>]');
}
let config = {};
try {
if (fs.existsSync(configPath)) {
config = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
} else if (hasDefault) {
output(defaultValue, raw, String(defaultValue));
return;
} else {
error('No config.json found at ' + configPath);
}
@@ -385,12 +395,14 @@ function cmdConfigGet(cwd, keyPath, raw) {
let current = config;
for (const key of keys) {
if (current === undefined || current === null || typeof current !== 'object') {
if (hasDefault) { output(defaultValue, raw, String(defaultValue)); return; }
error(`Key not found: ${keyPath}`);
}
current = current[key];
}
if (current === undefined) {
if (hasDefault) { output(defaultValue, raw, String(defaultValue)); return; }
error(`Key not found: ${keyPath}`);
}

View File

@@ -27,6 +27,16 @@ const WORKSTREAM_SESSION_ENV_KEYS = [
let cachedControllingTtyToken = null;
let didProbeControllingTtyToken = false;
// Track all .planning/.lock files held by this process so they can be removed
// on exit. process.on('exit') fires even on process.exit(1), unlike try/finally
// which is skipped when error() calls process.exit(1) inside a locked region (#1916).
const _heldPlanningLocks = new Set();
process.on('exit', () => {
for (const lockPath of _heldPlanningLocks) {
try { fs.unlinkSync(lockPath); } catch { /* already gone */ }
}
});
// ─── Path helpers ────────────────────────────────────────────────────────────
/** Normalize a relative path to always use forward slashes (cross-platform). */
@@ -229,6 +239,7 @@ const CONFIG_DEFAULTS = {
plan_checker: true,
verifier: true,
nyquist_validation: true,
ai_integration_phase: true,
parallelization: true,
brave_search: false,
firecrawl: false,
@@ -400,7 +411,11 @@ function loadConfig(cwd) {
// ─── Git utilities ────────────────────────────────────────────────────────────
const _gitIgnoredCache = new Map();
function isGitIgnored(cwd, targetPath) {
const key = cwd + '::' + targetPath;
if (_gitIgnoredCache.has(key)) return _gitIgnoredCache.get(key);
try {
// --no-index checks .gitignore rules regardless of whether the file is tracked.
// Without it, git check-ignore returns "not ignored" for tracked files even when
@@ -412,8 +427,10 @@ function isGitIgnored(cwd, targetPath) {
cwd,
stdio: 'pipe',
});
_gitIgnoredCache.set(key, true);
return true;
} catch {
_gitIgnoredCache.set(key, false);
return false;
}
}
@@ -598,10 +615,15 @@ function withPlanningLock(cwd, fn) {
acquired: new Date().toISOString(),
}), { flag: 'wx' });
// Register for exit-time cleanup so process.exit(1) inside a locked region
// cannot leave a stale lock file (#1916).
_heldPlanningLocks.add(lockPath);
// Lock acquired — run the function
try {
return fn();
} finally {
_heldPlanningLocks.delete(lockPath);
try { fs.unlinkSync(lockPath); } catch { /* already released */ }
}
} catch (err) {
@@ -670,19 +692,23 @@ function planningRoot(cwd) {
}
/**
* Get common .planning file paths, workstream-aware.
* Scoped paths (state, roadmap, phases, requirements) resolve to the active workstream.
* Shared paths (project, config) always resolve to the root .planning/.
* Get common .planning file paths, project-and-workstream-aware.
*
* All paths route through planningDir(cwd, ws), which honors the GSD_PROJECT
* env var and active workstream. This matches loadConfig() above (line 256),
* which has always read config.json via planningDir(cwd). Previously project
* and config were resolved against the unrouted .planning/ root, which broke
* `gsd-tools config-get` in multi-project layouts (the CRUD writers and the
* reader pointed at different files).
*/
function planningPaths(cwd, ws) {
const base = planningDir(cwd, ws);
const root = path.join(cwd, '.planning');
return {
planning: base,
state: path.join(base, 'STATE.md'),
roadmap: path.join(base, 'ROADMAP.md'),
project: path.join(root, 'PROJECT.md'),
config: path.join(root, 'config.json'),
project: path.join(base, 'PROJECT.md'),
config: path.join(base, 'config.json'),
phases: path.join(base, 'phases'),
requirements: path.join(base, 'REQUIREMENTS.md'),
};
@@ -879,7 +905,10 @@ function normalizePhaseName(phase) {
const match = stripped.match(/^(\d+)([A-Z])?((?:\.\d+)*)/i);
if (match) {
const padded = match[1].padStart(2, '0');
const letter = match[2] ? match[2].toUpperCase() : '';
// Preserve original case of letter suffix (#1962).
// Uppercasing causes directory/roadmap mismatches on case-sensitive filesystems
// (e.g., "16c" in ROADMAP.md → directory "16C-name" → progress can't match).
const letter = match[2] || '';
const decimal = match[3] || '';
return padded + letter + decimal;
}
@@ -1485,6 +1514,38 @@ function readSubdirectories(dirPath, sort = false) {
}
}
// ─── Atomic file writes ───────────────────────────────────────────────────────
/**
* Write a file atomically using write-to-temp-then-rename.
*
* On POSIX systems, `fs.renameSync` is atomic when the source and destination
* are on the same filesystem. This prevents a process killed mid-write from
* leaving a truncated file that is unparseable on next read.
*
* The temp file is placed alongside the target so it is guaranteed to be on
* the same filesystem (required for rename atomicity). The PID is embedded in
* the temp file name so concurrent writers use distinct paths.
*
* If `renameSync` fails (e.g. cross-device move), the function falls back to a
* direct `writeFileSync` so callers always get a best-effort write.
*
* @param {string} filePath Absolute path to write.
* @param {string|Buffer} content File content.
* @param {string} [encoding='utf-8'] Encoding passed to writeFileSync.
*/
function atomicWriteFileSync(filePath, content, encoding = 'utf-8') {
const tmpPath = filePath + '.tmp.' + process.pid;
try {
fs.writeFileSync(tmpPath, content, encoding);
fs.renameSync(tmpPath, filePath);
} catch (renameErr) {
// Clean up the temp file if rename failed, then fall back to direct write.
try { fs.unlinkSync(tmpPath); } catch { /* already gone or never created */ }
fs.writeFileSync(filePath, content, encoding);
}
}
module.exports = {
output,
error,
@@ -1530,4 +1591,5 @@ module.exports = {
readSubdirectories,
getAgentsDir,
checkAgentsInstalled,
atomicWriteFileSync,
};

View File

@@ -870,6 +870,23 @@ function cmdInitManager(cwd, raw) {
const phasesDir = paths.phases;
const isDirInMilestone = getMilestonePhaseFilter(cwd);
// Pre-compute directory listing once (avoids O(N) readdirSync per phase)
const _phaseDirEntries = (() => {
try {
return fs.readdirSync(phasesDir, { withFileTypes: true })
.filter(e => e.isDirectory())
.map(e => e.name);
} catch { return []; }
})();
// Pre-extract all checkbox states in a single pass (avoids O(N) regex per phase)
const _checkboxStates = new Map();
const _cbPattern = /-\s*\[(x| )\]\s*.*Phase\s+(\d+[A-Z]?(?:\.\d+)*)[:\s]/gi;
let _cbMatch;
while ((_cbMatch = _cbPattern.exec(content)) !== null) {
_checkboxStates.set(_cbMatch[2], _cbMatch[1].toLowerCase() === 'x');
}
const phasePattern = /#{2,4}\s*Phase\s+(\d+[A-Z]?(?:\.\d+)*)\s*:\s*([^\n]+)/gi;
const phases = [];
let match;
@@ -900,8 +917,7 @@ function cmdInitManager(cwd, raw) {
let isActive = false;
try {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory()).map(e => e.name).filter(isDirInMilestone);
const dirs = _phaseDirEntries.filter(isDirInMilestone);
const dirMatch = dirs.find(d => phaseTokenMatches(d, normalized));
if (dirMatch) {
@@ -935,10 +951,8 @@ function cmdInitManager(cwd, raw) {
}
} catch { /* intentionally empty */ }
// Check ROADMAP checkbox status
const checkboxPattern = new RegExp(`-\\s*\\[(x| )\\]\\s*.*Phase\\s+${phaseNum.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}[:\\s]`, 'i');
const checkboxMatch = content.match(checkboxPattern);
const roadmapComplete = checkboxMatch ? checkboxMatch[1] === 'x' : false;
// Check ROADMAP checkbox status (pre-extracted above the loop)
const roadmapComplete = _checkboxStates.get(phaseNum) || false;
if (roadmapComplete && diskStatus !== 'complete') {
diskStatus = 'complete';
}

View File

@@ -41,29 +41,30 @@ function cmdRequirementsMarkComplete(cwd, reqIdsRaw, raw) {
const reqEscaped = escapeRegex(reqId);
// Update checkbox: - [ ] **REQ-ID** → - [x] **REQ-ID**
// Use replace() directly and compare — avoids test()+replace() global regex
// lastIndex bug where test() advances state and replace() misses matches.
const checkboxPattern = new RegExp(`(-\\s*\\[)[ ](\\]\\s*\\*\\*${reqEscaped}\\*\\*)`, 'gi');
if (checkboxPattern.test(reqContent)) {
reqContent = reqContent.replace(checkboxPattern, '$1x$2');
const afterCheckbox = reqContent.replace(checkboxPattern, '$1x$2');
if (afterCheckbox !== reqContent) {
reqContent = afterCheckbox;
found = true;
}
// Update traceability table: | REQ-ID | Phase N | Pending | → | REQ-ID | Phase N | Complete |
const tablePattern = new RegExp(`(\\|\\s*${reqEscaped}\\s*\\|[^|]+\\|)\\s*Pending\\s*(\\|)`, 'gi');
if (tablePattern.test(reqContent)) {
// Re-read since test() advances lastIndex for global regex
reqContent = reqContent.replace(
new RegExp(`(\\|\\s*${reqEscaped}\\s*\\|[^|]+\\|)\\s*Pending\\s*(\\|)`, 'gi'),
'$1 Complete $2'
);
const afterTable = reqContent.replace(tablePattern, '$1 Complete $2');
if (afterTable !== reqContent) {
reqContent = afterTable;
found = true;
}
if (found) {
updated.push(reqId);
} else {
// Check if already complete before declaring not_found
const doneCheckbox = new RegExp(`-\\s*\\[x\\]\\s*\\*\\*${reqEscaped}\\*\\*`, 'gi');
const doneTable = new RegExp(`\\|\\s*${reqEscaped}\\s*\\|[^|]+\\|\\s*Complete\\s*\\|`, 'gi');
// Check if already complete before declaring not_found.
// Non-global flag is fine here — we only need to know if a match exists.
const doneCheckbox = new RegExp(`-\\s*\\[x\\]\\s*\\*\\*${reqEscaped}\\*\\*`, 'i');
const doneTable = new RegExp(`\\|\\s*${reqEscaped}\\s*\\|[^|]+\\|\\s*Complete\\s*\\|`, 'i');
if (doneCheckbox.test(reqContent) || doneTable.test(reqContent)) {
alreadyComplete.push(reqId);
} else {
@@ -253,7 +254,7 @@ function cmdPhasesClear(cwd, raw, args) {
if (fs.existsSync(phasesDir)) {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory());
const dirs = entries.filter(e => e.isDirectory() && !/^999(?:\.|$)/.test(e.name));
if (dirs.length > 0 && !confirm) {
error(

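The `lastIndex` hazard the requirements.cjs changes work around can be shown in isolation (generic repro, not project code): a global regex carries match state between calls to `test()`, so a second call resumes past the first match and can report `false` even though the text still matches.

```javascript
// A global regex is stateful: test() advances lastIndex, so the
// second call resumes mid-string and misses the (only) match.
const pattern = /REQ-01/g;
const text = 'REQ-01 is listed once';
const first = pattern.test(text);   // advances lastIndex past the match
const second = pattern.test(text);  // resumes at lastIndex, finds nothing
```

Comparing the string before and after `replace()`, as the diff does, sidesteps the stateful check entirely; the non-global `i` flag is enough when only existence matters.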
View File

@@ -6,7 +6,7 @@ const fs = require('fs');
const path = require('path');
const { escapeRegex, loadConfig, normalizePhaseName, comparePhaseNum, findPhaseInternal, getArchivedPhaseDirs, generateSlugInternal, getMilestonePhaseFilter, stripShippedMilestones, extractCurrentMilestone, replaceInCurrentMilestone, toPosixPath, planningDir, withPlanningLock, output, error, readSubdirectories, phaseTokenMatches } = require('./core.cjs');
const { extractFrontmatter } = require('./frontmatter.cjs');
const { writeStateMd, stateExtractField, stateReplaceField, stateReplaceFieldWithFallback, updatePerformanceMetricsSection } = require('./state.cjs');
const { writeStateMd, readModifyWriteStateMd, stateExtractField, stateReplaceField, stateReplaceFieldWithFallback, updatePerformanceMetricsSection } = require('./state.cjs');
function cmdPhasesList(cwd, options, raw) {
const phasesDir = path.join(planningDir(cwd), 'phases');
@@ -88,50 +88,49 @@ function cmdPhaseNextDecimal(cwd, basePhase, raw) {
const phasesDir = path.join(planningDir(cwd), 'phases');
const normalized = normalizePhaseName(basePhase);
// Check if phases directory exists
if (!fs.existsSync(phasesDir)) {
output(
{
found: false,
base_phase: normalized,
next: `${normalized}.1`,
existing: [],
},
raw,
`${normalized}.1`
);
return;
}
try {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory()).map(e => e.name);
let baseExists = false;
const decimalSet = new Set();
// Check if base phase exists
const baseExists = dirs.some(d => phaseTokenMatches(d, normalized));
// Scan directory names for existing decimal phases
if (fs.existsSync(phasesDir)) {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory()).map(e => e.name);
baseExists = dirs.some(d => phaseTokenMatches(d, normalized));
// Find existing decimal phases for this base
const decimalPattern = new RegExp(`^${normalized}\\.(\\d+)`);
const existingDecimals = [];
for (const dir of dirs) {
const match = dir.match(decimalPattern);
if (match) {
existingDecimals.push(`${normalized}.${match[1]}`);
const dirPattern = new RegExp(`^(?:[A-Z]{1,6}-)?${escapeRegex(normalized)}\\.(\\d+)`);
for (const dir of dirs) {
const match = dir.match(dirPattern);
if (match) decimalSet.add(parseInt(match[1], 10));
}
}
// Sort numerically
existingDecimals.sort((a, b) => comparePhaseNum(a, b));
// Also scan ROADMAP.md for phase entries that may not have directories yet
const roadmapPath = path.join(planningDir(cwd), 'ROADMAP.md');
if (fs.existsSync(roadmapPath)) {
try {
const roadmapContent = fs.readFileSync(roadmapPath, 'utf-8');
const phasePattern = new RegExp(
`#{2,4}\\s*Phase\\s+0*${escapeRegex(normalized)}\\.(\\d+)\\s*:`, 'gi'
);
let pm;
while ((pm = phasePattern.exec(roadmapContent)) !== null) {
decimalSet.add(parseInt(pm[1], 10));
}
} catch { /* ROADMAP.md read failure is non-fatal */ }
}
// Build sorted list of existing decimals
const existingDecimals = Array.from(decimalSet)
.sort((a, b) => a - b)
.map(n => `${normalized}.${n}`);
// Calculate next decimal
let nextDecimal;
if (existingDecimals.length === 0) {
if (decimalSet.size === 0) {
nextDecimal = `${normalized}.1`;
} else {
const lastDecimal = existingDecimals[existingDecimals.length - 1];
const lastNum = parseInt(lastDecimal.split('.')[1], 10);
nextDecimal = `${normalized}.${lastNum + 1}`;
nextDecimal = `${normalized}.${Math.max(...decimalSet) + 1}`;
}
output(
@@ -341,15 +340,34 @@ function cmdPhaseAdd(cwd, description, raw, customId) {
if (!_newPhaseId) error('--id required when phase_naming is "custom"');
_dirName = `${prefix}${_newPhaseId}-${slug}`;
} else {
// Sequential mode: find highest integer phase number (in current milestone only)
// Sequential mode: find highest integer phase number from two sources:
// 1. ROADMAP.md (current milestone only)
// 2. .planning/phases/ on disk (orphan directories not tracked in roadmap)
// Skip 999.x backlog phases — they live outside the active sequence
const phasePattern = /#{2,4}\s*Phase\s+(\d+)[A-Z]?(?:\.\d+)*:/gi;
let maxPhase = 0;
let m;
while ((m = phasePattern.exec(content)) !== null) {
const num = parseInt(m[1], 10);
if (num >= 999) continue; // backlog phases use 999.x numbering
if (num > maxPhase) maxPhase = num;
}
// Also scan .planning/phases/ for orphan directories not tracked in ROADMAP.
// Directory names follow: [PREFIX-]NN-slug (e.g. 03-api or CK-05-old-feature).
// Strip the optional project_code prefix before extracting the leading integer.
const phasesOnDisk = path.join(planningDir(cwd), 'phases');
if (fs.existsSync(phasesOnDisk)) {
const dirNumPattern = /^(?:[A-Z][A-Z0-9]*-)?(\d+)-/;
for (const entry of fs.readdirSync(phasesOnDisk)) {
const match = entry.match(dirNumPattern);
if (!match) continue;
const num = parseInt(match[1], 10);
if (num >= 999) continue; // skip backlog orphans
if (num > maxPhase) maxPhase = num;
}
}
_newPhaseId = maxPhase + 1;
const paddedNum = String(_newPhaseId).padStart(2, '0');
_dirName = `${prefix}${paddedNum}-${slug}`;
@@ -416,22 +434,31 @@ function cmdPhaseInsert(cwd, afterPhase, description, raw) {
error(`Phase ${afterPhase} not found in ROADMAP.md`);
}
// Calculate next decimal using existing logic
// Calculate next decimal by scanning both directories AND ROADMAP.md entries
const phasesDir = path.join(planningDir(cwd), 'phases');
const normalizedBase = normalizePhaseName(afterPhase);
let existingDecimals = [];
const decimalSet = new Set();
try {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory()).map(e => e.name);
const decimalPattern = new RegExp(`^(?:[A-Z]{1,6}-)?${normalizedBase}\\.(\\d+)`);
const decimalPattern = new RegExp(`^(?:[A-Z]{1,6}-)?${escapeRegex(normalizedBase)}\\.(\\d+)`);
for (const dir of dirs) {
const dm = dir.match(decimalPattern);
if (dm) existingDecimals.push(parseInt(dm[1], 10));
if (dm) decimalSet.add(parseInt(dm[1], 10));
}
} catch { /* intentionally empty */ }
const nextDecimal = existingDecimals.length === 0 ? 1 : Math.max(...existingDecimals) + 1;
// Also scan ROADMAP.md content (already loaded) for decimal entries
const rmPhasePattern = new RegExp(
`#{2,4}\\s*Phase\\s+0*${escapeRegex(normalizedBase)}\\.(\\d+)\\s*:`, 'gi'
);
let rmMatch;
while ((rmMatch = rmPhasePattern.exec(rawContent)) !== null) {
decimalSet.add(parseInt(rmMatch[1], 10));
}
const nextDecimal = decimalSet.size === 0 ? 1 : Math.max(...decimalSet) + 1;
const _decimalPhase = `${normalizedBase}.${nextDecimal}`;
// Optional project code prefix
const insertConfig = loadConfig(cwd);
@@ -624,19 +651,20 @@ function cmdPhaseRemove(cwd, targetPhase, options, raw) {
// Update ROADMAP.md
updateRoadmapAfterPhaseRemoval(roadmapPath, targetPhase, isDecimal, parseInt(normalized, 10), cwd);
// Update STATE.md phase count
// Update STATE.md phase count atomically (#P4.4)
const statePath = path.join(planningDir(cwd), 'STATE.md');
if (fs.existsSync(statePath)) {
let stateContent = fs.readFileSync(statePath, 'utf-8');
const totalRaw = stateExtractField(stateContent, 'Total Phases');
if (totalRaw) {
stateContent = stateReplaceField(stateContent, 'Total Phases', String(parseInt(totalRaw, 10) - 1)) || stateContent;
}
const ofMatch = stateContent.match(/(\bof\s+)(\d+)(\s*(?:\(|phases?))/i);
if (ofMatch) {
stateContent = stateContent.replace(/(\bof\s+)(\d+)(\s*(?:\(|phases?))/i, `$1${parseInt(ofMatch[2], 10) - 1}$3`);
}
writeStateMd(statePath, stateContent, cwd);
readModifyWriteStateMd(statePath, (stateContent) => {
const totalRaw = stateExtractField(stateContent, 'Total Phases');
if (totalRaw) {
stateContent = stateReplaceField(stateContent, 'Total Phases', String(parseInt(totalRaw, 10) - 1)) || stateContent;
}
const ofMatch = stateContent.match(/(\bof\s+)(\d+)(\s*(?:\(|phases?))/i);
if (ofMatch) {
stateContent = stateContent.replace(/(\bof\s+)(\d+)(\s*(?:\(|phases?))/i, `$1${parseInt(ofMatch[2], 10) - 1}$3`);
}
return stateContent;
}, cwd);
}
output({
@@ -701,7 +729,7 @@ function cmdPhaseComplete(cwd, phaseNum, raw) {
`(-\\s*\\[)[ ](\\]\\s*.*Phase\\s+${escapeRegex(phaseNum)}[:\\s][^\\n]*)`,
'i'
);
roadmapContent = replaceInCurrentMilestone(roadmapContent, checkboxPattern, `$1x$2 (completed ${today})`);
roadmapContent = roadmapContent.replace(checkboxPattern, `$1x$2 (completed ${today})`);
// Progress table: update Status to Complete, add date (handles 4 or 5 column tables)
const phaseEscaped = escapeRegex(phaseNum);
@@ -725,13 +753,20 @@ function cmdPhaseComplete(cwd, phaseNum, raw) {
return '|' + cells.join('|') + '|';
});
// Update plan count in phase section
// Update plan count in phase section.
// Use direct .replace() rather than replaceInCurrentMilestone() so this
// works when the current milestone section is itself inside a <details>
// block (the standard /gsd-new-project layout). replaceInCurrentMilestone
// scopes to content after the last </details>, which misses content inside
// the current milestone's own <details> wrapper (#2005).
// The phase-scoped heading pattern is specific enough to avoid matching
// archived phases (which belong to different milestones).
const planCountPattern = new RegExp(
`(#{2,4}\\s*Phase\\s+${phaseEscaped}[\\s\\S]*?\\*\\*Plans:\\*\\*\\s*)[^\\n]+`,
'i'
);
roadmapContent = replaceInCurrentMilestone(
roadmapContent, planCountPattern,
roadmapContent = roadmapContent.replace(
planCountPattern,
`$1${summaryCount}/${planCount} plans complete`
);
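The comment in the hunk above argues the phase-scoped pattern is specific enough to apply with a plain `.replace()` even when the current milestone sits inside a `<details>` wrapper. A quick standalone check against a hypothetical roadmap snippet (the phase number, name, and plan counts here are illustrative, not from the repo):

```javascript
// escapeRegex as assumed from core.cjs: escape regex metacharacters.
const escapeRegex = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

const roadmap = [
  '<details>',
  '### Phase 3: Auth',
  '**Plans:** 0/2 plans complete',
  '</details>',
].join('\n');

// Same shape as the planCountPattern in the diff: a phase-scoped heading
// followed (lazily) by the **Plans:** field on some later line.
const planCountPattern = new RegExp(
  `(#{2,4}\\s*Phase\\s+${escapeRegex('3')}[\\s\\S]*?\\*\\*Plans:\\*\\*\\s*)[^\\n]+`,
  'i'
);
const updated = roadmap.replace(planCountPattern, (_m, prefix) => `${prefix}1/2 plans complete`);
```

Because the pattern anchors on the phase heading itself, it matches inside the `<details>` block that a "content after the last `</details>`" scoping rule would skip.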
@@ -834,71 +869,72 @@ function cmdPhaseComplete(cwd, phaseNum, raw) {
} catch { /* intentionally empty */ }
}
// Update STATE.md — use shared helpers that handle both **bold:** and plain Field: formats
// Update STATE.md atomically — hold lock across read-modify-write (#P4.4).
// Previously read outside the lock; a crash between the ROADMAP update
// (locked above) and this write left ROADMAP/STATE inconsistent.
if (fs.existsSync(statePath)) {
let stateContent = fs.readFileSync(statePath, 'utf-8');
// Update Current Phase — preserve "X of Y (Name)" compound format
const phaseValue = nextPhaseNum || phaseNum;
const existingPhaseField = stateExtractField(stateContent, 'Current Phase')
|| stateExtractField(stateContent, 'Phase');
let newPhaseValue = String(phaseValue);
if (existingPhaseField) {
const totalMatch = existingPhaseField.match(/of\s+(\d+)/);
const nameMatch = existingPhaseField.match(/\(([^)]+)\)/);
if (totalMatch) {
const total = totalMatch[1];
const nameStr = nextPhaseName ? ` (${nextPhaseName.replace(/-/g, ' ')})` : (nameMatch ? ` (${nameMatch[1]})` : '');
newPhaseValue = `${phaseValue} of ${total}${nameStr}`;
}
}
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Phase', 'Phase', newPhaseValue);
// Update Current Phase Name
if (nextPhaseName) {
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Phase Name', null, nextPhaseName.replace(/-/g, ' '));
}
// Update Status
stateContent = stateReplaceFieldWithFallback(stateContent, 'Status', null,
isLastPhase ? 'Milestone complete' : 'Ready to plan');
// Update Current Plan
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Plan', 'Plan', 'Not started');
// Update Last Activity
stateContent = stateReplaceFieldWithFallback(stateContent, 'Last Activity', 'Last activity', today);
// Update Last Activity Description
stateContent = stateReplaceFieldWithFallback(stateContent, 'Last Activity Description', null,
`Phase ${phaseNum} complete${nextPhaseNum ? `, transitioned to Phase ${nextPhaseNum}` : ''}`);
// Increment Completed Phases counter (#956)
const completedRaw = stateExtractField(stateContent, 'Completed Phases');
if (completedRaw) {
const newCompleted = parseInt(completedRaw, 10) + 1;
stateContent = stateReplaceField(stateContent, 'Completed Phases', String(newCompleted)) || stateContent;
// Recalculate percent based on completed / total (#956)
const totalRaw = stateExtractField(stateContent, 'Total Phases');
if (totalRaw) {
const totalPhases = parseInt(totalRaw, 10);
if (totalPhases > 0) {
const newPercent = Math.round((newCompleted / totalPhases) * 100);
stateContent = stateReplaceField(stateContent, 'Progress', `${newPercent}%`) || stateContent;
// Also update percent field if it exists separately
stateContent = stateContent.replace(
/(percent:\s*)\d+/,
`$1${newPercent}`
);
readModifyWriteStateMd(statePath, (stateContent) => {
// Update Current Phase — preserve "X of Y (Name)" compound format
const phaseValue = nextPhaseNum || phaseNum;
const existingPhaseField = stateExtractField(stateContent, 'Current Phase')
|| stateExtractField(stateContent, 'Phase');
let newPhaseValue = String(phaseValue);
if (existingPhaseField) {
const totalMatch = existingPhaseField.match(/of\s+(\d+)/);
const nameMatch = existingPhaseField.match(/\(([^)]+)\)/);
if (totalMatch) {
const total = totalMatch[1];
const nameStr = nextPhaseName ? ` (${nextPhaseName.replace(/-/g, ' ')})` : (nameMatch ? ` (${nameMatch[1]})` : '');
newPhaseValue = `${phaseValue} of ${total}${nameStr}`;
}
}
}
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Phase', 'Phase', newPhaseValue);
// Gate 4: Update Performance Metrics section (#1627)
stateContent = updatePerformanceMetricsSection(stateContent, cwd, phaseNum, planCount, summaryCount);
// Update Current Phase Name
if (nextPhaseName) {
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Phase Name', null, nextPhaseName.replace(/-/g, ' '));
}
writeStateMd(statePath, stateContent, cwd);
// Update Status
stateContent = stateReplaceFieldWithFallback(stateContent, 'Status', null,
isLastPhase ? 'Milestone complete' : 'Ready to plan');
// Update Current Plan
stateContent = stateReplaceFieldWithFallback(stateContent, 'Current Plan', 'Plan', 'Not started');
// Update Last Activity
stateContent = stateReplaceFieldWithFallback(stateContent, 'Last Activity', 'Last activity', today);
// Update Last Activity Description
stateContent = stateReplaceFieldWithFallback(stateContent, 'Last Activity Description', null,
`Phase ${phaseNum} complete${nextPhaseNum ? `, transitioned to Phase ${nextPhaseNum}` : ''}`);
// Increment Completed Phases counter (#956)
const completedRaw = stateExtractField(stateContent, 'Completed Phases');
if (completedRaw) {
const newCompleted = parseInt(completedRaw, 10) + 1;
stateContent = stateReplaceField(stateContent, 'Completed Phases', String(newCompleted)) || stateContent;
// Recalculate percent based on completed / total (#956)
const totalRaw = stateExtractField(stateContent, 'Total Phases');
if (totalRaw) {
const totalPhases = parseInt(totalRaw, 10);
if (totalPhases > 0) {
const newPercent = Math.round((newCompleted / totalPhases) * 100);
stateContent = stateReplaceField(stateContent, 'Progress', `${newPercent}%`) || stateContent;
stateContent = stateContent.replace(
/(percent:\s*)\d+/,
`$1${newPercent}`
);
}
}
}
// Gate 4: Update Performance Metrics section (#1627)
stateContent = updatePerformanceMetricsSection(stateContent, cwd, phaseNum, planCount, summaryCount);
return stateContent;
}, cwd);
}
const result = {


@@ -4,7 +4,7 @@
const fs = require('fs');
const path = require('path');
const { escapeRegex, normalizePhaseName, planningPaths, withPlanningLock, output, error, findPhaseInternal, stripShippedMilestones, extractCurrentMilestone, replaceInCurrentMilestone, phaseTokenMatches } = require('./core.cjs');
const { escapeRegex, normalizePhaseName, planningPaths, withPlanningLock, output, error, findPhaseInternal, stripShippedMilestones, extractCurrentMilestone, replaceInCurrentMilestone, phaseTokenMatches, atomicWriteFileSync } = require('./core.cjs');
/**
* Search for a phase header (and its section) within the given content string.
@@ -129,6 +129,15 @@ function cmdRoadmapAnalyze(cwd, raw) {
const phases = [];
let match;
// Build phase directory lookup once (O(1) readdir instead of O(N) per phase)
const _phaseDirNames = (() => {
try {
return fs.readdirSync(phasesDir, { withFileTypes: true })
.filter(e => e.isDirectory())
.map(e => e.name);
} catch { return []; }
})();
while ((match = phasePattern.exec(content)) !== null) {
const phaseNum = match[1];
const phaseName = match[2].replace(/\(INSERTED\)/i, '').trim();
@@ -155,9 +164,7 @@ function cmdRoadmapAnalyze(cwd, raw) {
let hasResearch = false;
try {
const entries = fs.readdirSync(phasesDir, { withFileTypes: true });
const dirs = entries.filter(e => e.isDirectory()).map(e => e.name);
const dirMatch = dirs.find(d => phaseTokenMatches(d, normalized));
const dirMatch = _phaseDirNames.find(d => phaseTokenMatches(d, normalized));
if (dirMatch) {
const phaseFiles = fs.readdirSync(path.join(phasesDir, dirMatch));
@@ -334,7 +341,7 @@ function cmdRoadmapUpdatePlanProgress(cwd, phaseNum, raw) {
roadmapContent = roadmapContent.replace(planCheckboxPattern, '$1x$2');
}
fs.writeFileSync(roadmapPath, roadmapContent, 'utf-8');
atomicWriteFileSync(roadmapPath, roadmapContent, 'utf-8');
});
output({
updated: true,


@@ -4,7 +4,7 @@
const fs = require('fs');
const path = require('path');
const { escapeRegex, loadConfig, getMilestoneInfo, getMilestonePhaseFilter, normalizeMd, planningDir, planningPaths, output, error } = require('./core.cjs');
const { escapeRegex, loadConfig, getMilestoneInfo, getMilestonePhaseFilter, normalizeMd, planningDir, planningPaths, output, error, atomicWriteFileSync } = require('./core.cjs');
const { extractFrontmatter, reconstructFrontmatter } = require('./frontmatter.cjs');
/** Shorthand — every state command needs this path */
@@ -12,6 +12,16 @@ function getStatePath(cwd) {
return planningPaths(cwd).state;
}
// Track all lock files held by this process so they can be removed on exit.
// process.on('exit') fires even on process.exit(1), unlike try/finally which is
// skipped when error() calls process.exit(1) inside a locked region (#1916).
const _heldStateLocks = new Set();
process.on('exit', () => {
for (const lockPath of _heldStateLocks) {
try { require('fs').unlinkSync(lockPath); } catch { /* already gone */ }
}
});
// Shared helper: extract a field value from STATE.md content.
// Supports both **Field:** bold and plain Field: format.
function stateExtractField(content, fieldName) {
@@ -184,18 +194,22 @@ function cmdStateUpdate(cwd, field, value) {
const statePath = planningPaths(cwd).state;
try {
let content = fs.readFileSync(statePath, 'utf-8');
const fieldEscaped = escapeRegex(field);
// Try **Field:** bold format first, then plain Field: format
const boldPattern = new RegExp(`(\\*\\*${fieldEscaped}:\\*\\*\\s*)(.*)`, 'i');
const plainPattern = new RegExp(`(^${fieldEscaped}:\\s*)(.*)`, 'im');
if (boldPattern.test(content)) {
content = content.replace(boldPattern, (_match, prefix) => `${prefix}${value}`);
writeStateMd(statePath, content, cwd);
output({ updated: true });
} else if (plainPattern.test(content)) {
content = content.replace(plainPattern, (_match, prefix) => `${prefix}${value}`);
writeStateMd(statePath, content, cwd);
let updated = false;
readModifyWriteStateMd(statePath, (content) => {
const fieldEscaped = escapeRegex(field);
// Try **Field:** bold format first, then plain Field: format
const boldPattern = new RegExp(`(\\*\\*${fieldEscaped}:\\*\\*\\s*)(.*)`, 'i');
const plainPattern = new RegExp(`(^${fieldEscaped}:\\s*)(.*)`, 'im');
if (boldPattern.test(content)) {
updated = true;
return content.replace(boldPattern, (_match, prefix) => `${prefix}${value}`);
} else if (plainPattern.test(content)) {
updated = true;
return content.replace(plainPattern, (_match, prefix) => `${prefix}${value}`);
}
return content;
}, cwd);
if (updated) {
output({ updated: true });
} else {
output({ updated: false, reason: `Field "${field}" not found in STATE.md` });
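The update path above tries the bold `**Field:** value` format before falling back to the line-anchored plain `Field: value` format. A standalone demo of those two patterns (a simplified stand-in for the real helper, which also reports whether anything matched):

```javascript
const escapeRegex = s => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

// Replace a STATE.md-style field in either supported format. The plain
// pattern is anchored with ^ and the 'm' flag so it only matches a field
// at the start of a line, not "Status:" appearing mid-sentence.
function replaceField(content, field, value) {
  const f = escapeRegex(field);
  const bold = new RegExp(`(\\*\\*${f}:\\*\\*\\s*)(.*)`, 'i');
  const plain = new RegExp(`(^${f}:\\s*)(.*)`, 'im');
  if (bold.test(content)) return content.replace(bold, (_m, prefix) => `${prefix}${value}`);
  if (plain.test(content)) return content.replace(plain, (_m, prefix) => `${prefix}${value}`);
  return content; // field not found — caller reports updated: false
}
```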
@@ -274,55 +288,67 @@ function cmdStateAdvancePlan(cwd, raw) {
const statePath = planningPaths(cwd).state;
if (!fs.existsSync(statePath)) { output({ error: 'STATE.md not found' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
const today = new Date().toISOString().split('T')[0];
let result = null;
// Try legacy separate fields first, then compound "Plan: X of Y" format
const legacyPlan = stateExtractField(content, 'Current Plan');
const legacyTotal = stateExtractField(content, 'Total Plans in Phase');
const planField = stateExtractField(content, 'Plan');
readModifyWriteStateMd(statePath, (content) => {
// Try legacy separate fields first, then compound "Plan: X of Y" format
const legacyPlan = stateExtractField(content, 'Current Plan');
const legacyTotal = stateExtractField(content, 'Total Plans in Phase');
const planField = stateExtractField(content, 'Plan');
let currentPlan, totalPlans;
let useCompoundFormat = false;
let currentPlan, totalPlans;
let useCompoundFormat = false;
if (legacyPlan && legacyTotal) {
currentPlan = parseInt(legacyPlan, 10);
totalPlans = parseInt(legacyTotal, 10);
} else if (planField) {
// Compound format: "2 of 6 in current phase" or "2 of 6"
currentPlan = parseInt(planField, 10);
const ofMatch = planField.match(/of\s+(\d+)/);
totalPlans = ofMatch ? parseInt(ofMatch[1], 10) : NaN;
useCompoundFormat = true;
}
if (legacyPlan && legacyTotal) {
currentPlan = parseInt(legacyPlan, 10);
totalPlans = parseInt(legacyTotal, 10);
} else if (planField) {
// Compound format: "2 of 6 in current phase" or "2 of 6"
currentPlan = parseInt(planField, 10);
const ofMatch = planField.match(/of\s+(\d+)/);
totalPlans = ofMatch ? parseInt(ofMatch[1], 10) : NaN;
useCompoundFormat = true;
}
if (isNaN(currentPlan) || isNaN(totalPlans)) {
if (isNaN(currentPlan) || isNaN(totalPlans)) {
result = { error: true };
return content;
}
if (currentPlan >= totalPlans) {
content = stateReplaceFieldWithFallback(content, 'Status', null, 'Phase complete — ready for verification');
content = stateReplaceFieldWithFallback(content, 'Last Activity', 'Last activity', today);
content = updateCurrentPositionFields(content, { status: 'Phase complete — ready for verification', lastActivity: today });
result = { advanced: false, reason: 'last_plan', current_plan: currentPlan, total_plans: totalPlans, status: 'ready_for_verification' };
} else {
const newPlan = currentPlan + 1;
let planDisplayValue;
if (useCompoundFormat) {
// Preserve compound format: "X of Y in current phase" → replace X only
planDisplayValue = planField.replace(/^\d+/, String(newPlan));
content = stateReplaceField(content, 'Plan', planDisplayValue) || content;
} else {
planDisplayValue = `${newPlan} of ${totalPlans}`;
content = stateReplaceField(content, 'Current Plan', String(newPlan)) || content;
}
content = stateReplaceFieldWithFallback(content, 'Status', null, 'Ready to execute');
content = stateReplaceFieldWithFallback(content, 'Last Activity', 'Last activity', today);
content = updateCurrentPositionFields(content, { status: 'Ready to execute', lastActivity: today, plan: planDisplayValue });
result = { advanced: true, previous_plan: currentPlan, current_plan: newPlan, total_plans: totalPlans };
}
return content;
}, cwd);
if (!result || result.error) {
output({ error: 'Cannot parse Current Plan or Total Plans in Phase from STATE.md' }, raw);
return;
}
if (currentPlan >= totalPlans) {
content = stateReplaceFieldWithFallback(content, 'Status', null, 'Phase complete — ready for verification');
content = stateReplaceFieldWithFallback(content, 'Last Activity', 'Last activity', today);
content = updateCurrentPositionFields(content, { status: 'Phase complete — ready for verification', lastActivity: today });
writeStateMd(statePath, content, cwd);
output({ advanced: false, reason: 'last_plan', current_plan: currentPlan, total_plans: totalPlans, status: 'ready_for_verification' }, raw, 'false');
if (result.advanced === false) {
output(result, raw, 'false');
} else {
const newPlan = currentPlan + 1;
let planDisplayValue;
if (useCompoundFormat) {
// Preserve compound format: "X of Y in current phase" → replace X only
planDisplayValue = planField.replace(/^\d+/, String(newPlan));
content = stateReplaceField(content, 'Plan', planDisplayValue) || content;
} else {
planDisplayValue = `${newPlan} of ${totalPlans}`;
content = stateReplaceField(content, 'Current Plan', String(newPlan)) || content;
}
content = stateReplaceFieldWithFallback(content, 'Status', null, 'Ready to execute');
content = stateReplaceFieldWithFallback(content, 'Last Activity', 'Last activity', today);
content = updateCurrentPositionFields(content, { status: 'Ready to execute', lastActivity: today, plan: planDisplayValue });
writeStateMd(statePath, content, cwd);
output({ advanced: true, previous_plan: currentPlan, current_plan: newPlan, total_plans: totalPlans }, raw, 'true');
output(result, raw, 'true');
}
}
@@ -330,7 +356,6 @@ function cmdStateRecordMetric(cwd, options, raw) {
const statePath = planningPaths(cwd).state;
if (!fs.existsSync(statePath)) { output({ error: 'STATE.md not found' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
const { phase, plan, duration, tasks, files } = options;
if (!phase || !plan || !duration) {
@@ -338,22 +363,29 @@ function cmdStateRecordMetric(cwd, options, raw) {
return;
}
// Find Performance Metrics section and its table
const metricsPattern = /(##\s*Performance Metrics[\s\S]*?\n\|[^\n]+\n\|[-|\s]+\n)([\s\S]*?)(?=\n##|\n$|$)/i;
const metricsMatch = content.match(metricsPattern);
let recorded = false;
readModifyWriteStateMd(statePath, (content) => {
// Find Performance Metrics section and its table
const metricsPattern = /(##\s*Performance Metrics[\s\S]*?\n\|[^\n]+\n\|[-|\s]+\n)([\s\S]*?)(?=\n##|\n$|$)/i;
const metricsMatch = content.match(metricsPattern);
if (metricsMatch) {
let tableBody = metricsMatch[2].trimEnd();
const newRow = `| Phase ${phase} P${plan} | ${duration} | ${tasks || '-'} tasks | ${files || '-'} files |`;
if (metricsMatch) {
let tableBody = metricsMatch[2].trimEnd();
const newRow = `| Phase ${phase} P${plan} | ${duration} | ${tasks || '-'} tasks | ${files || '-'} files |`;
if (tableBody.trim() === '' || tableBody.includes('None yet')) {
tableBody = newRow;
} else {
tableBody = tableBody + '\n' + newRow;
if (tableBody.trim() === '' || tableBody.includes('None yet')) {
tableBody = newRow;
} else {
tableBody = tableBody + '\n' + newRow;
}
recorded = true;
return content.replace(metricsPattern, (_match, header) => `${header}${tableBody}\n`);
}
return content;
}, cwd);
content = content.replace(metricsPattern, (_match, header) => `${header}${tableBody}\n`);
writeStateMd(statePath, content, cwd);
if (recorded) {
output({ recorded: true, phase, plan, duration }, raw, 'true');
} else {
output({ recorded: false, reason: 'Performance Metrics section not found in STATE.md' }, raw, 'false');
@@ -364,9 +396,7 @@ function cmdStateUpdateProgress(cwd, raw) {
const statePath = planningPaths(cwd).state;
if (!fs.existsSync(statePath)) { output({ error: 'STATE.md not found' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
// Count summaries across current milestone phases only
// Count summaries across current milestone phases only (outside lock — read-only)
const phasesDir = planningPaths(cwd).phases;
let totalPlans = 0;
let totalSummaries = 0;
@@ -389,17 +419,26 @@ function cmdStateUpdateProgress(cwd, raw) {
const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(barWidth - filled);
const progressStr = `[${bar}] ${percent}%`;
// Try **Progress:** bold format first, then plain Progress: format
const boldProgressPattern = /(\*\*Progress:\*\*\s*).*/i;
const plainProgressPattern = /^(Progress:\s*).*/im;
if (boldProgressPattern.test(content)) {
content = content.replace(boldProgressPattern, (_match, prefix) => `${prefix}${progressStr}`);
writeStateMd(statePath, content, cwd);
output({ updated: true, percent, completed: totalSummaries, total: totalPlans, bar: progressStr }, raw, progressStr);
} else if (plainProgressPattern.test(content)) {
content = content.replace(plainProgressPattern, (_match, prefix) => `${prefix}${progressStr}`);
writeStateMd(statePath, content, cwd);
output({ updated: true, percent, completed: totalSummaries, total: totalPlans, bar: progressStr }, raw, progressStr);
let updated = false;
const _totalPlans = totalPlans;
const _totalSummaries = totalSummaries;
readModifyWriteStateMd(statePath, (content) => {
// Try **Progress:** bold format first, then plain Progress: format
const boldProgressPattern = /(\*\*Progress:\*\*\s*).*/i;
const plainProgressPattern = /^(Progress:\s*).*/im;
if (boldProgressPattern.test(content)) {
updated = true;
return content.replace(boldProgressPattern, (_match, prefix) => `${prefix}${progressStr}`);
} else if (plainProgressPattern.test(content)) {
updated = true;
return content.replace(plainProgressPattern, (_match, prefix) => `${prefix}${progressStr}`);
}
return content;
}, cwd);
if (updated) {
output({ updated: true, percent, completed: _totalSummaries, total: _totalPlans, bar: progressStr }, raw, progressStr);
} else {
output({ updated: false, reason: 'Progress field not found in STATE.md' }, raw, 'false');
}
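The progress string built above concatenates filled (U+2588) and empty (U+2591) block characters with a percent label. A standalone rendering sketch; the bar width and rounding behavior are assumptions, since the diff shows only the final concatenation:

```javascript
// Render "[█████░░░░░] 50%"-style progress. barWidth is assumed; the
// hunk above computes `filled` and `percent` before this point.
function renderProgress(completed, total, barWidth = 10) {
  const percent = total > 0 ? Math.round((completed / total) * 100) : 0;
  const filled = total > 0 ? Math.round((completed / total) * barWidth) : 0;
  const bar = '\u2588'.repeat(filled) + '\u2591'.repeat(barWidth - filled);
  return `[${bar}] ${percent}%`;
}
```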
@@ -423,20 +462,26 @@ function cmdStateAddDecision(cwd, options, raw) {
if (!summaryText) { output({ error: 'summary required' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
const entry = `- [Phase ${phase || '?'}]: ${summaryText}${rationaleText ? `${rationaleText}` : ''}`;
let added = false;
// Find Decisions section (various heading patterns)
const sectionPattern = /(###?\s*(?:Decisions|Decisions Made|Accumulated.*Decisions)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
readModifyWriteStateMd(statePath, (content) => {
// Find Decisions section (various heading patterns)
const sectionPattern = /(###?\s*(?:Decisions|Decisions Made|Accumulated.*Decisions)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
if (match) {
let sectionBody = match[2];
// Remove placeholders
sectionBody = sectionBody.replace(/None yet\.?\s*\n?/gi, '').replace(/No decisions yet\.?\s*\n?/gi, '');
sectionBody = sectionBody.trimEnd() + '\n' + entry + '\n';
content = content.replace(sectionPattern, (_match, header) => `${header}${sectionBody}`);
writeStateMd(statePath, content, cwd);
if (match) {
let sectionBody = match[2];
// Remove placeholders
sectionBody = sectionBody.replace(/None yet\.?\s*\n?/gi, '').replace(/No decisions yet\.?\s*\n?/gi, '');
sectionBody = sectionBody.trimEnd() + '\n' + entry + '\n';
added = true;
return content.replace(sectionPattern, (_match, header) => `${header}${sectionBody}`);
}
return content;
}, cwd);
if (added) {
output({ added: true, decision: entry }, raw, 'true');
} else {
output({ added: false, reason: 'Decisions section not found in STATE.md' }, raw, 'false');
@@ -458,18 +503,24 @@ function cmdStateAddBlocker(cwd, text, raw) {
if (!blockerText) { output({ error: 'text required' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
const entry = `- ${blockerText}`;
let added = false;
const sectionPattern = /(###?\s*(?:Blockers|Blockers\/Concerns|Concerns)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
readModifyWriteStateMd(statePath, (content) => {
const sectionPattern = /(###?\s*(?:Blockers|Blockers\/Concerns|Concerns)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
if (match) {
let sectionBody = match[2];
sectionBody = sectionBody.replace(/None\.?\s*\n?/gi, '').replace(/None yet\.?\s*\n?/gi, '');
sectionBody = sectionBody.trimEnd() + '\n' + entry + '\n';
content = content.replace(sectionPattern, (_match, header) => `${header}${sectionBody}`);
writeStateMd(statePath, content, cwd);
if (match) {
let sectionBody = match[2];
sectionBody = sectionBody.replace(/None\.?\s*\n?/gi, '').replace(/None yet\.?\s*\n?/gi, '');
sectionBody = sectionBody.trimEnd() + '\n' + entry + '\n';
added = true;
return content.replace(sectionPattern, (_match, header) => `${header}${sectionBody}`);
}
return content;
}, cwd);
if (added) {
output({ added: true, blocker: blockerText }, raw, 'true');
} else {
output({ added: false, reason: 'Blockers section not found in STATE.md' }, raw, 'false');
@@ -481,27 +532,33 @@ function cmdStateResolveBlocker(cwd, text, raw) {
if (!fs.existsSync(statePath)) { output({ error: 'STATE.md not found' }, raw); return; }
if (!text) { output({ error: 'text required' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
let resolved = false;
const sectionPattern = /(###?\s*(?:Blockers|Blockers\/Concerns|Concerns)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
readModifyWriteStateMd(statePath, (content) => {
const sectionPattern = /(###?\s*(?:Blockers|Blockers\/Concerns|Concerns)\s*\n)([\s\S]*?)(?=\n###?|\n##[^#]|$)/i;
const match = content.match(sectionPattern);
if (match) {
const sectionBody = match[2];
const lines = sectionBody.split('\n');
const filtered = lines.filter(line => {
if (!line.startsWith('- ')) return true;
return !line.toLowerCase().includes(text.toLowerCase());
});
if (match) {
const sectionBody = match[2];
const lines = sectionBody.split('\n');
const filtered = lines.filter(line => {
if (!line.startsWith('- ')) return true;
return !line.toLowerCase().includes(text.toLowerCase());
});
let newBody = filtered.join('\n');
// If section is now empty, add placeholder
if (!newBody.trim() || !newBody.includes('- ')) {
newBody = 'None\n';
let newBody = filtered.join('\n');
// If section is now empty, add placeholder
if (!newBody.trim() || !newBody.includes('- ')) {
newBody = 'None\n';
}
resolved = true;
return content.replace(sectionPattern, (_match, header) => `${header}${newBody}`);
}
return content;
}, cwd);
content = content.replace(sectionPattern, (_match, header) => `${header}${newBody}`);
writeStateMd(statePath, content, cwd);
if (resolved) {
output({ resolved: true, blocker: text }, raw, 'true');
} else {
output({ resolved: false, reason: 'Blockers section not found in STATE.md' }, raw, 'false');
@@ -512,31 +569,33 @@ function cmdStateRecordSession(cwd, options, raw) {
const statePath = planningPaths(cwd).state;
if (!fs.existsSync(statePath)) { output({ error: 'STATE.md not found' }, raw); return; }
let content = fs.readFileSync(statePath, 'utf-8');
const now = new Date().toISOString();
const updated = [];
// Update Last session / Last Date
let result = stateReplaceField(content, 'Last session', now);
if (result) { content = result; updated.push('Last session'); }
result = stateReplaceField(content, 'Last Date', now);
if (result) { content = result; updated.push('Last Date'); }
readModifyWriteStateMd(statePath, (content) => {
// Update Last session / Last Date
let result = stateReplaceField(content, 'Last session', now);
if (result) { content = result; updated.push('Last session'); }
result = stateReplaceField(content, 'Last Date', now);
if (result) { content = result; updated.push('Last Date'); }
// Update Stopped at
if (options.stopped_at) {
result = stateReplaceField(content, 'Stopped At', options.stopped_at);
if (!result) result = stateReplaceField(content, 'Stopped at', options.stopped_at);
if (result) { content = result; updated.push('Stopped At'); }
}
// Update Stopped at
if (options.stopped_at) {
result = stateReplaceField(content, 'Stopped At', options.stopped_at);
if (!result) result = stateReplaceField(content, 'Stopped at', options.stopped_at);
if (result) { content = result; updated.push('Stopped At'); }
}
// Update Resume file
const resumeFile = options.resume_file || 'None';
result = stateReplaceField(content, 'Resume File', resumeFile);
if (!result) result = stateReplaceField(content, 'Resume file', resumeFile);
if (result) { content = result; updated.push('Resume File'); }
// Update Resume file
const resumeFile = options.resume_file || 'None';
result = stateReplaceField(content, 'Resume File', resumeFile);
if (!result) result = stateReplaceField(content, 'Resume file', resumeFile);
if (result) { content = result; updated.push('Resume File'); }
return content;
}, cwd);
if (updated.length > 0) {
writeStateMd(statePath, content, cwd);
output({ recorded: true, updated }, raw, 'true');
} else {
output({ recorded: false, reason: 'No session fields found in STATE.md' }, raw, 'false');
@@ -805,6 +864,9 @@ function acquireStateLock(statePath) {
const fd = fs.openSync(lockPath, fs.constants.O_CREAT | fs.constants.O_EXCL | fs.constants.O_WRONLY);
fs.writeSync(fd, String(process.pid));
fs.closeSync(fd);
// Register for exit-time cleanup so process.exit(1) inside a locked region
// cannot leave a stale lock file (#1916).
_heldStateLocks.add(lockPath);
return lockPath;
} catch (err) {
if (err.code === 'EEXIST') {
@@ -821,8 +883,7 @@ function acquireStateLock(statePath) {
return lockPath;
}
const jitter = Math.floor(Math.random() * 50);
const start = Date.now();
while (Date.now() - start < retryDelay + jitter) { /* busy wait */ }
Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, retryDelay + jitter);
continue;
}
return lockPath; // non-EEXIST error — proceed without lock
@@ -832,6 +893,7 @@ function acquireStateLock(statePath) {
}
function releaseStateLock(lockPath) {
_heldStateLocks.delete(lockPath);
try { fs.unlinkSync(lockPath); } catch { /* lock already gone */ }
}
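The retry-delay change in `acquireStateLock` above swaps a CPU-burning busy loop for `Atomics.wait` on a throwaway `SharedArrayBuffer`: since nothing ever calls `Atomics.notify` on that buffer, the call simply blocks the thread until the timeout elapses. As a standalone helper:

```javascript
// Synchronous sleep without spinning: Atomics.wait blocks until the timeout
// because no other thread ever notifies this throwaway buffer. Node.js
// permits Atomics.wait on the main thread (browsers do not).
function sleepSync(ms) {
  Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
}
```

Unlike the old `while (Date.now() - start < …)` loop, this yields the CPU for the whole delay, which matters when several processes are contending for the same lock file.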
@@ -845,7 +907,7 @@ function writeStateMd(statePath, content, cwd) {
const synced = syncStateFrontmatter(content, cwd);
const lockPath = acquireStateLock(statePath);
try {
fs.writeFileSync(statePath, normalizeMd(synced), 'utf-8');
atomicWriteFileSync(statePath, normalizeMd(synced), 'utf-8');
} finally {
releaseStateLock(lockPath);
}
@@ -863,7 +925,7 @@ function readModifyWriteStateMd(statePath, transformFn, cwd) {
const content = fs.existsSync(statePath) ? fs.readFileSync(statePath, 'utf-8') : '';
const modified = transformFn(content);
const synced = syncStateFrontmatter(modified, cwd);
fs.writeFileSync(statePath, normalizeMd(synced), 'utf-8');
atomicWriteFileSync(statePath, normalizeMd(synced), 'utf-8');
} finally {
releaseStateLock(lockPath);
}
@@ -913,96 +975,95 @@ function cmdStateBeginPhase(cwd, phaseNumber, phaseName, planCount, raw) {
return;
}
let content = fs.readFileSync(statePath, 'utf-8');
const today = new Date().toISOString().split('T')[0];
const updated = [];
// Update Status field
const statusValue = `Executing Phase ${phaseNumber}`;
let result = stateReplaceField(content, 'Status', statusValue);
if (result) { content = result; updated.push('Status'); }
readModifyWriteStateMd(statePath, (content) => {
// Update Status field
const statusValue = `Executing Phase ${phaseNumber}`;
let result = stateReplaceField(content, 'Status', statusValue);
if (result) { content = result; updated.push('Status'); }
// Update Last Activity
result = stateReplaceField(content, 'Last Activity', today);
if (result) { content = result; updated.push('Last Activity'); }
// Update Last Activity
result = stateReplaceField(content, 'Last Activity', today);
if (result) { content = result; updated.push('Last Activity'); }
// Update Last Activity Description if it exists
const activityDesc = `Phase ${phaseNumber} execution started`;
result = stateReplaceField(content, 'Last Activity Description', activityDesc);
if (result) { content = result; updated.push('Last Activity Description'); }
// Update Last Activity Description if it exists
const activityDesc = `Phase ${phaseNumber} execution started`;
result = stateReplaceField(content, 'Last Activity Description', activityDesc);
if (result) { content = result; updated.push('Last Activity Description'); }
// Update Current Phase
result = stateReplaceField(content, 'Current Phase', String(phaseNumber));
if (result) { content = result; updated.push('Current Phase'); }
// Update Current Phase
result = stateReplaceField(content, 'Current Phase', String(phaseNumber));
if (result) { content = result; updated.push('Current Phase'); }
// Update Current Phase Name
if (phaseName) {
result = stateReplaceField(content, 'Current Phase Name', phaseName);
if (result) { content = result; updated.push('Current Phase Name'); }
}
// Update Current Plan to 1 (starting from the first plan)
result = stateReplaceField(content, 'Current Plan', '1');
if (result) { content = result; updated.push('Current Plan'); }
// Update Total Plans in Phase
if (planCount) {
result = stateReplaceField(content, 'Total Plans in Phase', String(planCount));
if (result) { content = result; updated.push('Total Plans in Phase'); }
}
// Update **Current focus:** body text line (#1104)
const focusLabel = phaseName ? `Phase ${phaseNumber}${phaseName}` : `Phase ${phaseNumber}`;
const focusPattern = /(\*\*Current focus:\*\*\s*).*/i;
if (focusPattern.test(content)) {
content = content.replace(focusPattern, (_match, prefix) => `${prefix}${focusLabel}`);
updated.push('Current focus');
}
// Update ## Current Position section (#1104, #1365)
// Update individual fields within Current Position instead of replacing the
// entire section, so that Status, Last activity, and Progress are preserved.
const positionPattern = /(##\s*Current Position\s*\n)([\s\S]*?)(?=\n##|$)/i;
const positionMatch = content.match(positionPattern);
if (positionMatch) {
const header = positionMatch[1];
let posBody = positionMatch[2];
// Update or insert Phase line
const newPhase = `Phase: ${phaseNumber}${phaseName ? ` (${phaseName})` : ''} — EXECUTING`;
if (/^Phase:/m.test(posBody)) {
posBody = posBody.replace(/^Phase:.*$/m, newPhase);
} else {
posBody = newPhase + '\n' + posBody;
}
// Update or insert Plan line
const newPlan = `Plan: 1 of ${planCount || '?'}`;
if (/^Plan:/m.test(posBody)) {
posBody = posBody.replace(/^Plan:.*$/m, newPlan);
} else {
posBody = posBody.replace(/^(Phase:.*$)/m, `$1\n${newPlan}`);
}
// Update Status line if present
const newStatus = `Status: Executing Phase ${phaseNumber}`;
if (/^Status:/m.test(posBody)) {
posBody = posBody.replace(/^Status:.*$/m, newStatus);
}
// Update Last activity line if present
const newActivity = `Last activity: ${today} -- Phase ${phaseNumber} execution started`;
if (/^Last activity:/im.test(posBody)) {
posBody = posBody.replace(/^Last activity:.*$/im, newActivity);
}
content = content.replace(positionPattern, `${header}${posBody}`);
updated.push('Current Position');
}
if (updated.length > 0) {
writeStateMd(statePath, content, cwd);
}
return content;
}, cwd);
output({ updated, phase: phaseNumber, phase_name: phaseName || null, plan_count: planCount || null }, raw, updated.length > 0 ? 'true' : 'false');
}
@@ -1330,6 +1391,7 @@ module.exports = {
stateReplaceField,
stateReplaceFieldWithFallback,
writeStateMd,
readModifyWriteStateMd,
updatePerformanceMetricsSection,
cmdStateLoad,
cmdStateGet,


@@ -4,7 +4,7 @@
const fs = require('fs');
const path = require('path');
const { normalizePhaseName, findPhaseInternal, generateSlugInternal, normalizeMd, toPosixPath, output, error } = require('./core.cjs');
const { normalizePhaseName, findPhaseInternal, generateSlugInternal, normalizeMd, toPosixPath, planningDir, output, error } = require('./core.cjs');
const { reconstructFrontmatter } = require('./frontmatter.cjs');
function cmdTemplateSelect(cwd, planPath, raw) {
@@ -131,6 +131,10 @@ function cmdTemplateFill(cwd, templateType, options, raw) {
must_haves: { truths: [], artifacts: [], key_links: [] },
...fields,
};
const planBase = planningDir(cwd);
const projectRef = toPosixPath(path.relative(cwd, path.join(planBase, 'PROJECT.md')));
const roadmapRef = toPosixPath(path.relative(cwd, path.join(planBase, 'ROADMAP.md')));
const stateRef = toPosixPath(path.relative(cwd, path.join(planBase, 'STATE.md')));
body = [
`# Phase ${options.phase} Plan ${planNum}: [Title]`,
'',
@@ -140,9 +144,9 @@ function cmdTemplateFill(cwd, templateType, options, raw) {
'- **Output:** [Concrete deliverable]',
'',
'## Context',
'@.planning/PROJECT.md',
'@.planning/ROADMAP.md',
'@.planning/STATE.md',
`@${projectRef}`,
`@${roadmapRef}`,
`@${stateRef}`,
'',
'## Tasks',
'',


@@ -5,7 +5,7 @@
const fs = require('fs');
const path = require('path');
const os = require('os');
const { safeReadFile, loadConfig, normalizePhaseName, escapeRegex, execGit, findPhaseInternal, getMilestoneInfo, stripShippedMilestones, extractCurrentMilestone, planningDir, planningRoot, output, error, checkAgentsInstalled, CONFIG_DEFAULTS } = require('./core.cjs');
const { safeReadFile, loadConfig, normalizePhaseName, escapeRegex, execGit, findPhaseInternal, getMilestoneInfo, stripShippedMilestones, extractCurrentMilestone, planningDir, output, error, checkAgentsInstalled, CONFIG_DEFAULTS } = require('./core.cjs');
const { extractFrontmatter, parseMustHavesBlock } = require('./frontmatter.cjs');
const { writeStateMd } = require('./state.cjs');
@@ -534,11 +534,10 @@ function cmdValidateHealth(cwd, options, raw) {
}
const planBase = planningDir(cwd);
const planRoot = planningRoot(cwd);
const projectPath = path.join(planRoot, 'PROJECT.md');
const projectPath = path.join(planBase, 'PROJECT.md');
const roadmapPath = path.join(planBase, 'ROADMAP.md');
const statePath = path.join(planBase, 'STATE.md');
const configPath = path.join(planRoot, 'config.json');
const configPath = path.join(planBase, 'config.json');
const phasesDir = path.join(planBase, 'phases');
const errors = [];
@@ -649,6 +648,10 @@ function cmdValidateHealth(cwd, options, raw) {
addIssue('warning', 'W008', 'config.json: workflow.nyquist_validation absent (defaults to enabled but agents may skip)', 'Run /gsd-health --repair to add key', true);
if (!repairs.includes('addNyquistKey')) repairs.push('addNyquistKey');
}
if (configParsed.workflow && configParsed.workflow.ai_integration_phase === undefined) {
addIssue('warning', 'W016', 'config.json: workflow.ai_integration_phase absent (defaults to enabled — run /gsd-ai-integration-phase before planning AI system phases)', 'Run /gsd-health --repair to add key', true);
if (!repairs.includes('addAiIntegrationPhaseKey')) repairs.push('addAiIntegrationPhaseKey');
}
} catch { /* intentionally empty */ }
}
@@ -740,10 +743,24 @@ function cmdValidateHealth(cwd, options, raw) {
}
} catch { /* intentionally empty */ }
// Build a set of phases explicitly marked not-yet-started in the ROADMAP
// summary list (- [ ] **Phase N:**). These phases are intentionally absent
// from disk -- W006 must not fire for them (#2009).
const notStartedPhases = new Set();
const uncheckedPattern = /-\s*\[\s\]\s*\*{0,2}Phase\s+(\d+[A-Z]?(?:\.\d+)*)[:\s*]/gi;
let um;
while ((um = uncheckedPattern.exec(roadmapContent)) !== null) {
notStartedPhases.add(um[1]);
// Also add zero-padded variant so 1 and 01 both match
notStartedPhases.add(String(parseInt(um[1], 10)).padStart(2, '0'));
}
// Phases in ROADMAP but not on disk
for (const p of roadmapPhases) {
const padded = String(parseInt(p, 10)).padStart(2, '0');
if (!diskPhases.has(p) && !diskPhases.has(padded)) {
// Skip phases explicitly flagged as not-yet-started in the summary list
if (notStartedPhases.has(p) || notStartedPhases.has(padded)) continue;
addIssue('warning', 'W006', `Phase ${p} in ROADMAP.md but no directory on disk`, 'Create phase directory or remove from roadmap');
}
}
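The unchecked-phase scan above can be exercised in isolation; the roadmap lines below are made up, the pattern is copied from the change:

```javascript
// Hypothetical ROADMAP summary list; only the unchecked phase is collected.
const roadmapContent = '- [x] **Phase 1:** Setup\n- [ ] **Phase 3:** Auth\n';
const notStartedPhases = new Set();
const uncheckedPattern = /-\s*\[\s\]\s*\*{0,2}Phase\s+(\d+[A-Z]?(?:\.\d+)*)[:\s*]/gi;
let um;
while ((um = uncheckedPattern.exec(roadmapContent)) !== null) {
  notStartedPhases.add(um[1]);
  // Zero-padded variant so 1 and 01 both match
  notStartedPhases.add(String(parseInt(um[1], 10)).padStart(2, '0'));
}
// notStartedPhases now holds '3' and '03'; the checked Phase 1 is skipped.
```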
@@ -861,9 +878,12 @@ function cmdValidateHealth(cwd, options, raw) {
}
// Generate minimal STATE.md from ROADMAP.md structure
const milestone = getMilestoneInfo(cwd);
const projectRef = path
.relative(cwd, path.join(planningDir(cwd), 'PROJECT.md'))
.split(path.sep).join('/');
let stateContent = `# Session State\n\n`;
stateContent += `## Project Reference\n\n`;
stateContent += `See: .planning/PROJECT.md\n\n`;
stateContent += `See: ${projectRef}\n\n`;
stateContent += `## Position\n\n`;
stateContent += `**Milestone:** ${milestone.version} ${milestone.name}\n`;
stateContent += `**Current phase:** (determining...)\n`;
@@ -891,6 +911,23 @@ function cmdValidateHealth(cwd, options, raw) {
}
break;
}
case 'addAiIntegrationPhaseKey': {
if (fs.existsSync(configPath)) {
try {
const configRaw = fs.readFileSync(configPath, 'utf-8');
const configParsed = JSON.parse(configRaw);
if (!configParsed.workflow) configParsed.workflow = {};
if (configParsed.workflow.ai_integration_phase === undefined) {
configParsed.workflow.ai_integration_phase = true;
fs.writeFileSync(configPath, JSON.stringify(configParsed, null, 2), 'utf-8');
}
repairActions.push({ action: repair, success: true, path: 'config.json' });
} catch (err) {
repairActions.push({ action: repair, success: false, error: err.message });
}
}
break;
}
}
} catch (err) {
repairActions.push({ action: repair, success: false, error: err.message });


@@ -0,0 +1,156 @@
# AI Evaluation Reference
> Reference used by `gsd-eval-planner` and `gsd-eval-auditor`.
> Based on "AI Evals for Everyone" course (Reganti & Badam) + industry practice.
---
## Core Concepts
### Why Evals Exist
AI systems are non-deterministic. Input X does not reliably produce output Y across runs, users, or edge cases. Evals are the continuous process of assessing whether your system's behavior meets expectations under real-world conditions — unit tests and integration tests alone are insufficient.
### Model vs. Product Evaluation
- **Model evals** (MMLU, HumanEval, GSM8K) — measure general capability in standardized conditions. Use as initial filter only.
- **Product evals** — measure behavior inside your specific system, with your data, your users, your domain rules. This is where 80% of eval effort belongs.
### The Three Components of Every Eval
- **Input** — everything affecting the system: query, history, retrieved docs, system prompt, config
- **Expected** — what good behavior looks like, defined through rubrics
- **Actual** — what the system produced, including intermediate steps, tool calls, and reasoning traces
### Three Measurement Approaches
1. **Code-based metrics** — deterministic checks: JSON validation, required disclaimers, performance thresholds, classification flags. Fast, cheap, reliable. Use first.
2. **LLM judges** — one model evaluates another against a rubric. Powerful for subjective qualities (tone, reasoning, escalation). Requires calibration against human judgment before trusting.
3. **Human evaluation** — gold standard for nuanced judgment. Doesn't scale. Use for calibration, edge cases, periodic sampling, and high-stakes decisions.
Most effective systems combine all three.
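As a sketch of the "code-based metrics first" idea — a deterministic gate that runs before any LLM judge (field names and the disclaimer rule here are invented for illustration):

```javascript
// Hypothetical structured-output check: JSON validity, required field,
// required disclaimer. Fast, cheap, deterministic; run before judge passes.
function codeChecks(rawOutput) {
  const failures = [];
  let parsed;
  try {
    parsed = JSON.parse(rawOutput);
  } catch {
    return { pass: false, failures: ['invalid JSON'] };
  }
  if (typeof parsed.answer !== 'string') failures.push('missing answer field');
  if (!/not financial advice/i.test(parsed.answer || '')) failures.push('missing disclaimer');
  return { pass: failures.length === 0, failures };
}
```

Only outputs that pass gates like this need the more expensive judge or human review.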
---
## Evaluation Dimensions
### Pre-Deployment (Development Phase)
| Dimension | What It Measures | When It Matters |
|-----------|-----------------|-----------------|
| **Factual accuracy** | Correctness of claims against ground truth | RAG, knowledge bases, any factual assertions |
| **Context faithfulness** | Response grounded in provided context vs. fabricated | RAG pipelines, document Q&A, retrieval-augmented systems |
| **Hallucination detection** | Plausible but unsupported claims | All generative systems, high-stakes domains |
| **Escalation accuracy** | Correct identification of when human intervention needed | Customer service, healthcare, financial advisory |
| **Policy compliance** | Adherence to business rules, legal requirements, disclaimers | Regulated industries, enterprise deployments |
| **Tone/style appropriateness** | Match with brand voice, audience expectations, emotional context | Customer-facing systems, content generation |
| **Output structure validity** | Schema compliance, required fields, format correctness | Structured extraction, API integrations, data pipelines |
| **Task completion** | Whether the system accomplished the stated goal | Agentic workflows, multi-step tasks |
| **Tool use correctness** | Correct selection and invocation of tools | Agent systems with tool calls |
| **Safety** | Absence of harmful, biased, or inappropriate outputs | All user-facing systems |
### Production Monitoring
| Dimension | Monitoring Approach |
|-----------|---------------------|
| **Safety violations** | Online guardrail — real-time, immediate intervention |
| **Compliance failures** | Online guardrail — block or escalate before user sees output |
| **Quality degradation trends** | Offline flywheel — batch analysis of sampled interactions |
| **Emerging failure modes** | Signal-metric divergence — when user behavior signals diverge from metric scores, investigate manually |
| **Cost/latency drift** | Code-based metrics — automated threshold alerts |
---
## The Guardrail vs. Flywheel Decision
Ask: "If this behavior goes wrong, would it be catastrophic for my business?"
- **Yes → Guardrail** — run online, real-time, with immediate intervention (block, escalate, hand off). Be selective: guardrails add latency.
- **No → Flywheel** — run offline as batch analysis feeding system refinements over time.
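The decision above amounts to a routing rule applied per eval dimension; a sketch with invented dimension names:

```javascript
// Hypothetical routing of eval dimensions by blast radius:
// catastrophic -> online guardrail; everything else -> offline flywheel batch.
const dimensions = [
  { name: 'safety violation', catastrophic: true },
  { name: 'compliance failure', catastrophic: true },
  { name: 'tone drift', catastrophic: false },
];
const guardrails = dimensions.filter(d => d.catastrophic).map(d => d.name);
const flywheel = dimensions.filter(d => !d.catastrophic).map(d => d.name);
// Guardrails run on every interaction and may block or escalate;
// flywheel dimensions run on sampled batches and feed refinements.
```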
---
## Rubric Design
Generic metrics are meaningless without context. "Helpfulness" in real estate means summarizing listings clearly. In healthcare it means knowing when *not* to answer.
A rubric must define:
1. The dimension being measured
2. What scores 1, 3, and 5 on a 5-point scale (or pass/fail criteria)
3. Domain-specific examples of acceptable vs. unacceptable behavior
Without rubrics, LLM judges produce noise rather than signal.
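One way to make the three requirements above machine-usable is to store the anchors and examples as data the judge prompt is built from; the wording below is illustrative only:

```javascript
// Illustrative rubric for one dimension, with 1/3/5 anchors as required above.
const helpfulnessRubric = {
  dimension: 'helpfulness (healthcare assistant)',
  anchors: {
    1: 'Answers confidently outside scope, no referral to a clinician',
    3: 'Partially useful, but misses when to decline or escalate',
    5: 'Clearly scoped answer; declines and refers out when appropriate',
  },
  examples: {
    acceptable: 'I cannot diagnose this; please contact your physician.',
    unacceptable: 'Based on your symptoms, you likely have X.',
  },
};
// A judge prompt can then interpolate the 1/3/5 anchors verbatim instead of
// relying on the model's own notion of "helpfulness".
```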
---
## Reference Dataset Guidelines
- Start with **10-20 high-quality examples** — not 200 mediocre ones
- Cover: critical success scenarios, common user workflows, known edge cases, historical failure modes
- Have domain experts label the examples (not just engineers)
- Expand based on what you learn in production — don't build for hypothetical coverage
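A possible shape for one such entry, with expert-labeled expectations rather than a single gold answer (all field names are assumptions):

```javascript
// Illustrative reference-dataset entry. Start with 10-20 of these,
// labeled by domain experts, rather than 200 mediocre ones.
const referenceExample = {
  id: 'edge-007',
  category: 'known edge case',
  input: { query: 'Can I double my medication dose?', history: [] },
  expected: {
    must: ['decline to advise', 'refer to prescriber'],
    mustNot: ['specific dosage instruction'],
  },
  labeledBy: 'domain-expert',
};
```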
---
## Eval Tooling Guide
| Tool | Type | Best For | Key Strength |
|------|------|----------|-------------|
| **RAGAS** | Python library | RAG evaluation | Purpose-built metrics: faithfulness, answer relevance, context precision/recall |
| **Langfuse** | Platform (open-source, self-hostable) | All system types | Strong tracing, prompt management, good for teams wanting infrastructure control |
| **LangSmith** | Platform (commercial) | LangChain/LangGraph ecosystems | Tightest integration with LangChain; best if already in that ecosystem |
| **Arize Phoenix** | Platform (open-source + hosted) | RAG + multi-agent tracing | Strong RAG eval + trace visualization; open-source with hosted option |
| **Braintrust** | Platform (commercial) | Model-agnostic evaluation | Dataset and experiment management; good for comparing across frameworks |
| **Promptfoo** | CLI tool (open-source) | Prompt testing, CI/CD | CLI-first, excellent for CI/CD prompt regression testing |
### Tool Selection by System Type
| System Type | Recommended Tooling |
|-------------|---------------------|
| RAG / Knowledge Q&A | RAGAS + Arize Phoenix or Braintrust |
| Multi-agent systems | Langfuse + Arize Phoenix |
| Conversational / single-model | Promptfoo + Braintrust |
| Structured extraction | Promptfoo + code-based validators |
| LangChain/LangGraph projects | LangSmith (native integration) |
| Production monitoring (all types) | Langfuse, Arize Phoenix, or LangSmith |
---
## Evals in the Development Lifecycle
### Plan Phase (Evaluation-Aware Design)
Before writing code, define:
1. What type of AI system is being built → determines framework and dominant eval concerns
2. Critical failure modes (3-5 behaviors that cannot go wrong)
3. Rubrics — explicit definitions of acceptable/unacceptable behavior per dimension
4. Evaluation strategy — which dimensions use code metrics, LLM judges, or human review
5. Reference dataset requirements — size, composition, labeling approach
6. Eval tooling selection
Output: EVALS-SPEC section of AI-SPEC.md
### Execute Phase (Instrument While Building)
- Add tracing from day one (Langfuse, Arize Phoenix, or LangSmith)
- Build reference dataset concurrently with implementation
- Implement code-based checks first; add LLM judges only for subjective dimensions
- Run evals in CI/CD via Promptfoo or Braintrust
### Verify Phase (Pre-Deployment Validation)
- Run full reference dataset against all metrics
- Conduct human review of edge cases and LLM judge disagreements
- Calibrate LLM judges against human scores (target ≥ 0.7 correlation before trusting)
- Define and configure production guardrails
- Establish monitoring baseline
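The calibration target above can be checked with a plain Pearson correlation between human and judge scores on the same items; a self-contained sketch (scores are invented):

```javascript
// Pearson correlation between paired human and LLM-judge scores (1-5 scale).
function pearson(xs, ys) {
  const mean = a => a.reduce((s, v) => s + v, 0) / a.length;
  const mx = mean(xs), my = mean(ys);
  let num = 0, dx = 0, dy = 0;
  for (let i = 0; i < xs.length; i++) {
    num += (xs[i] - mx) * (ys[i] - my);
    dx += (xs[i] - mx) ** 2;
    dy += (ys[i] - my) ** 2;
  }
  return num / Math.sqrt(dx * dy);
}
// Hypothetical paired scores on the same eval items:
const human = [5, 4, 2, 5, 1, 3];
const judge = [4, 4, 2, 5, 2, 3];
const r = pearson(human, judge);
// Trust the judge for this dimension only if r >= 0.7.
```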
### Monitor Phase (Production Evaluation Loop)
- Smart sampling — weight toward interactions with concerning signals (retries, unusual length, explicit escalations)
- Online guardrails on every interaction
- Offline flywheel on sampled batch
- Watch for signal-metric divergence — the early warning system for evaluation gaps
---
## Common Pitfalls
1. **Assuming benchmarks predict product success** — they don't; model evals are a filter, not a verdict
2. **Engineering evals in isolation** — domain experts must co-define rubrics; engineers alone miss critical nuances
3. **Building comprehensive coverage on day one** — start small (10-20 examples), expand from real failure modes
4. **Trusting uncalibrated LLM judges** — validate against human judgment before relying on them
5. **Measuring everything** — only track metrics that drive decisions; "collect it all" produces noise
6. **Treating evaluation as one-time setup** — user behavior evolves, requirements change, failure modes emerge; evaluation is continuous


@@ -0,0 +1,186 @@
# AI Framework Decision Matrix
> Reference used by `gsd-framework-selector` and `gsd-ai-researcher`.
> Distilled from official docs, benchmarks, and developer reports (2026).
---
## Quick Picks
| Situation | Pick |
|-----------|------|
| Simplest path to a working agent (OpenAI) | OpenAI Agents SDK |
| Simplest path to a working agent (model-agnostic) | CrewAI |
| Production RAG / document Q&A | LlamaIndex |
| Complex stateful workflows with branching | LangGraph |
| Multi-agent teams with defined roles | CrewAI |
| Code-aware autonomous agents (Anthropic) | Claude Agent SDK |
| "I don't know my requirements yet" | LangChain |
| Regulated / audit-trail required | LangGraph |
| Enterprise Microsoft/.NET shops | AutoGen/AG2 |
| Google Cloud / Gemini-committed teams | Google ADK |
| Pure NLP pipelines with explicit control | Haystack |
---
## Framework Profiles
### CrewAI
- **Type:** Multi-agent orchestration
- **Language:** Python only
- **Model support:** Model-agnostic
- **Learning curve:** Beginner (role/task/crew maps to real teams)
- **Best for:** Content pipelines, research automation, business process workflows, rapid prototyping
- **Avoid if:** Fine-grained state management, TypeScript, fault-tolerant checkpointing, complex conditional branching
- **Strengths:** Fastest multi-agent prototyping, 5.76x faster than LangGraph on QA tasks, built-in memory (short/long/entity/contextual), Flows architecture, standalone (no LangChain dep)
- **Weaknesses:** Limited checkpointing, coarse error handling, Python only
- **Eval concerns:** Task decomposition accuracy, inter-agent handoff, goal completion rate, loop detection
### LlamaIndex
- **Type:** RAG and data ingestion
- **Language:** Python + TypeScript
- **Model support:** Model-agnostic
- **Learning curve:** Intermediate
- **Best for:** Legal research, internal knowledge assistants, enterprise document search, any system where retrieval quality is the #1 priority
- **Avoid if:** Primary need is agent orchestration, multi-agent collaboration, or chatbot conversation flow
- **Strengths:** Best-in-class document parsing (LlamaParse), 35% retrieval accuracy improvement, 20-30% faster queries, mixed retrieval strategies (vector + graph + reranker)
- **Weaknesses:** Data framework first — agent orchestration is secondary
- **Eval concerns:** Context faithfulness, hallucination, answer relevance, retrieval precision/recall
### LangChain
- **Type:** General-purpose LLM framework
- **Language:** Python + TypeScript
- **Model support:** Model-agnostic (widest ecosystem)
- **Learning curve:** Intermediate-Advanced
- **Best for:** Evolving requirements, many third-party integrations, teams wanting one framework for everything, RAG + agents + chains
- **Avoid if:** Simple well-defined use case, RAG-primary (use LlamaIndex), complex stateful workflows (use LangGraph), performance at scale is critical
- **Strengths:** Largest community and integration ecosystem, 25% faster development vs scratch, covers RAG/agents/chains/memory
- **Weaknesses:** Abstraction overhead, p99 latency degrades under load, complexity creep risk
- **Eval concerns:** End-to-end task completion, chain correctness, retrieval quality
### LangGraph
- **Type:** Stateful agent workflows (graph-based)
- **Language:** Python + TypeScript (full parity)
- **Model support:** Model-agnostic (inherits LangChain integrations)
- **Learning curve:** Intermediate-Advanced (graph mental model)
- **Best for:** Production-grade stateful workflows, regulated industries, audit trails, human-in-the-loop flows, fault-tolerant multi-step agents
- **Avoid if:** Simple chatbot, purely linear workflow, rapid prototyping
- **Strengths:** Best checkpointing (every node), time-travel debugging, native Postgres/Redis persistence, streaming support, chosen by 62% of developers for stateful agent work (2026)
- **Weaknesses:** More upfront scaffolding, steeper curve, overkill for simple cases
- **Eval concerns:** State transition correctness, goal completion rate, tool use accuracy, safety guardrails
### OpenAI Agents SDK
- **Type:** Native OpenAI agent framework
- **Language:** Python + TypeScript
- **Model support:** Optimized for OpenAI (supports 100+ via Chat Completions compatibility)
- **Learning curve:** Beginner (4 primitives: Agents, Handoffs, Guardrails, Tracing)
- **Best for:** OpenAI-committed teams, rapid agent prototyping, voice agents (gpt-realtime), teams wanting visual builder (AgentKit)
- **Avoid if:** Model flexibility needed, complex multi-agent collaboration, persistent state management required, vendor lock-in concern
- **Strengths:** Simplest mental model, built-in tracing and guardrails, Handoffs for agent delegation, Realtime Agents for voice
- **Weaknesses:** OpenAI vendor lock-in, no built-in persistent state, younger ecosystem
- **Eval concerns:** Instruction following, safety guardrails, escalation accuracy, tone consistency
### Claude Agent SDK (Anthropic)
- **Type:** Code-aware autonomous agent framework
- **Language:** Python + TypeScript
- **Model support:** Claude models only
- **Learning curve:** Intermediate (18 hook events, MCP, tool decorators)
- **Best for:** Developer tooling, code generation/review agents, autonomous coding assistants, MCP-heavy architectures, safety-critical applications
- **Avoid if:** Model flexibility needed, stable/mature API required, use case unrelated to code/tool-use
- **Strengths:** Deepest MCP integration, built-in filesystem/shell access, 18 lifecycle hooks, automatic context compaction, extended thinking, safety-first design
- **Weaknesses:** Claude-only vendor lock-in, newer/evolving API, smaller community
- **Eval concerns:** Tool use correctness, safety, code quality, instruction following
### AutoGen / AG2 / Microsoft Agent Framework
- **Type:** Multi-agent conversational framework
- **Language:** Python (AG2), Python + .NET (Microsoft Agent Framework)
- **Model support:** Model-agnostic
- **Learning curve:** Intermediate-Advanced
- **Best for:** Research applications, conversational problem-solving, code generation + execution loops, Microsoft/.NET shops
- **Avoid if:** You want ecosystem stability, deterministic workflows, or "safest long-term bet" (fragmentation risk)
- **Strengths:** Most sophisticated conversational agent patterns, code generation + execution loop, async event-driven (v0.4+), cross-language interop (Microsoft Agent Framework)
- **Weaknesses:** Ecosystem fragmented (AutoGen maintenance mode, AG2 fork, Microsoft Agent Framework preview) — genuine long-term risk
- **Eval concerns:** Conversation goal completion, consensus quality, code execution correctness
### Google ADK (Agent Development Kit)
- **Type:** Multi-agent orchestration framework
- **Language:** Python + Java
- **Model support:** Optimized for Gemini; supports other models via LiteLLM
- **Learning curve:** Intermediate (agent/tool/session model, familiar if you know LangGraph)
- **Best for:** Google Cloud / Vertex AI shops, multi-agent workflows needing built-in session management and memory, teams already committed to Gemini, agent pipelines that need Google Search / BigQuery tool integration
- **Avoid if:** Model flexibility is required beyond Gemini, no Google Cloud dependency acceptable, TypeScript-only stack
- **Strengths:** First-party Google support, built-in session/memory/artifact management, tight Vertex AI and Google Search integration, own eval framework (RAGAS-compatible), multi-agent by design (sequential, parallel, loop patterns), Java SDK for enterprise teams
- **Weaknesses:** Gemini vendor lock-in in practice, younger community than LangChain/LlamaIndex, less third-party integration depth
- **Eval concerns:** Multi-agent task decomposition, tool use correctness, session state consistency, goal completion rate
### Haystack
- **Type:** NLP pipeline framework
- **Language:** Python
- **Model support:** Model-agnostic
- **Learning curve:** Intermediate
- **Best for:** Explicit, auditable NLP pipelines, document processing with fine-grained control, enterprise search, regulated industries needing transparency
- **Avoid if:** Rapid prototyping, multi-agent workflows, or you want a large community
- **Strengths:** Explicit pipeline control, strong for structured data pipelines, good documentation
- **Weaknesses:** Smaller community, less agent-oriented than alternatives
- **Eval concerns:** Extraction accuracy, pipeline output validity, retrieval quality
---
## Decision Dimensions
### By System Type
| System Type | Primary Framework(s) | Key Eval Concerns |
|-------------|---------------------|-------------------|
| RAG / Knowledge Q&A | LlamaIndex, LangChain | Context faithfulness, hallucination, retrieval precision/recall |
| Multi-agent orchestration | CrewAI, LangGraph, Google ADK | Task decomposition, handoff quality, goal completion |
| Conversational assistants | OpenAI Agents SDK, Claude Agent SDK | Tone, safety, instruction following, escalation |
| Structured data extraction | LangChain, LlamaIndex | Schema compliance, extraction accuracy |
| Autonomous task agents | LangGraph, OpenAI Agents SDK | Safety guardrails, tool correctness, cost adherence |
| Content generation | Claude Agent SDK, OpenAI Agents SDK | Brand voice, factual accuracy, tone |
| Code automation | Claude Agent SDK | Code correctness, safety, test pass rate |
### By Team Size and Stage
| Context | Recommendation |
|---------|----------------|
| Solo dev, prototyping | OpenAI Agents SDK or CrewAI (fastest to running) |
| Solo dev, RAG | LlamaIndex (batteries included) |
| Team, production, stateful | LangGraph (best fault tolerance) |
| Team, evolving requirements | LangChain (broadest escape hatches) |
| Team, multi-agent | CrewAI (simplest role abstraction) |
| Enterprise, .NET | AutoGen/AG2 / Microsoft Agent Framework |
### By Model Commitment
| Preference | Framework |
|-----------|-----------|
| OpenAI-only | OpenAI Agents SDK |
| Anthropic/Claude-only | Claude Agent SDK |
| Google/Gemini-committed | Google ADK |
| Model-agnostic (full flexibility) | LangChain, LlamaIndex, CrewAI, LangGraph, Haystack |
---
## Anti-Patterns
1. **Using LangChain for simple chatbots** — Direct SDK call is less code, faster, and easier to debug
2. **Using CrewAI for complex stateful workflows** — Checkpointing gaps will bite you in production
3. **Using OpenAI Agents SDK with non-OpenAI models** — Loses the integration benefits you chose it for
4. **Using LlamaIndex as a multi-agent framework** — It can do agents, but that's not its strength
5. **Defaulting to LangChain without evaluating alternatives** — "Everyone uses it" ≠ right for your use case
6. **Starting a new project on AutoGen (not AG2)** — AutoGen is in maintenance mode; use AG2 or wait for Microsoft Agent Framework GA
7. **Choosing LangGraph for simple linear flows** — The graph overhead is not worth it; use LangChain chains instead
8. **Ignoring vendor lock-in** — Provider-native SDKs (OpenAI, Claude) trade flexibility for integration depth; decide consciously
---
## Combination Plays (Multi-Framework Stacks)
| Production Pattern | Stack |
|-------------------|-------|
| RAG with observability | LlamaIndex + LangSmith or Langfuse |
| Stateful agent with RAG | LangGraph + LlamaIndex |
| Multi-agent with tracing | CrewAI + Langfuse |
| OpenAI agents with evals | OpenAI Agents SDK + Promptfoo or Braintrust |
| Claude agents with MCP | Claude Agent SDK + LangSmith or Arize Phoenix |


@@ -6,85 +6,85 @@ Checklist of frequent bug patterns to scan before forming hypotheses. Ordered by
## Null / Undefined Access
- [ ] Accessing property on `null` or `undefined` — missing null check or optional chaining
- [ ] Function returns `undefined` instead of expected value — missing `return` statement or wrong branch
- [ ] Array/object destructuring on `null`/`undefined` — API returned error shape instead of data
- [ ] Optional parameter used without default — caller omitted argument
- **Null property access** — accessing property on `null` or `undefined`, missing null check or optional chaining
- **Missing return value** — function returns `undefined` instead of expected value, missing `return` statement or wrong branch
- **Destructuring null** — array/object destructuring on `null`/`undefined`, API returned error shape instead of data
- **Undefaulted optional** — optional parameter used without default, caller omitted argument
## Off-by-One / Boundary
- [ ] Loop starts at 1 instead of 0, or ends at `length` instead of `length - 1`
- [ ] Fence-post error — "N items need N-1 separators" miscounted
- [ ] Inclusive vs exclusive range boundary — `<` vs `<=`, slice/substring end index
- [ ] Empty collection not handled — `.length === 0` falls through to logic assuming items exist
- **Wrong loop bound** — loop starts at 1 instead of 0, or ends at `length` instead of `length - 1`
- **Fence-post error** — "N items need N-1 separators" miscounted
- **Inclusive vs exclusive** — range boundary `<` vs `<=`, slice/substring end index
- **Empty collection** — `.length === 0` falls through to logic assuming items exist
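The boundary items above can be reproduced in a few lines (values chosen for illustration):

```javascript
// Inclusive vs exclusive: slice's end index is exclusive.
const items = ['a', 'b', 'c', 'd'];
const firstThree = items.slice(0, 3);  // index 3 itself is excluded
// Fence-post: N items need N - 1 separators.
const joined = items.join(',');        // 3 commas for 4 items
const separators = joined.split(',').length - 1;
// Empty collection falling through: guard .length === 0 explicitly.
const empty = [];
const last = empty[empty.length - 1];  // undefined, not an error; easy to miss
```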
## Async / Timing
- [ ] Missing `await` on async function — gets Promise object instead of resolved value
- [ ] Race condition — two async operations read/write same state without coordination
- [ ] Stale closure — callback captures old variable value, not current one
- [ ] Event handler fires before setup complete — initialization order dependency
- [ ] Timeout/interval not cleaned up — fires after component/context destroyed
- **Missing await** — async function called without `await`, gets Promise object instead of resolved value
- **Race condition** — two async operations read/write same state without coordination
- **Stale closure** — callback captures old variable value, not current one
- **Initialization order** — event handler fires before setup complete
- **Leaked timer** — timeout/interval not cleaned up, fires after component/context destroyed
## State Management
- **Shared mutation** — object/array modified in place affects other consumers
- **Stale render** — state updated but UI not re-rendered, missing reactive trigger or wrong reference
- **Stale handler state** — closure captures state at bind time, not current value
- **Dual source of truth** — same data stored in two places, one gets out of sync
- **Invalid transition** — state machine allows an invalid transition, missing guard condition
## Import / Module
- **Circular dependency** — module A imports B, B imports A, one gets `undefined`
- **Export mismatch** — default vs named export, `import X` vs `import { X }`
- **Wrong extension** — `.js` vs `.cjs` vs `.mjs`, `.ts` vs `.tsx`
- **Path case sensitivity** — works on Windows/macOS, fails on Linux
- **Missing extension** — ESM requires explicit file extensions in imports
## Type / Coercion
- **String vs number compare** — `"5" > "10"` is `true` (lexicographic), `5 > 10` is `false`
- **Implicit coercion** — `==` instead of `===`, truthy/falsy surprises (`0`, `""`, `[]`)
- **Numeric precision** — `0.1 + 0.2 !== 0.3`, large integers lose precision
- **Falsy valid value** — value is `0` or `""` which is valid but falsy
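A few of these coercion surprises, demonstrated in runnable JavaScript:

```javascript
// Lexicographic vs numeric comparison.
console.log("5" > "10"); // true: compares character codes, "5" > "1"
console.log(5 > 10);     // false

// Loose equality coerces; strict equality does not.
console.log(0 == "");  // true
console.log(0 === ""); // false

// Floating point precision: compare with a tolerance instead of ===.
console.log(0.1 + 0.2 === 0.3);                // false
console.log(Math.abs(0.1 + 0.2 - 0.3) < 1e-9); // true
```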
## Environment / Config
- **Missing env var** — environment variable missing or wrong value in dev vs prod vs CI
- **Hardcoded path** — works on one machine, fails on another
- **Port conflict** — port already in use, previous process still running
- **Permission denied** — different user/group in deployment
- **Missing dependency** — not in package.json or not installed
## Data Shape / API Contract
- **Changed response shape** — backend updated, frontend expects old format
- **Wrong container type** — array where object expected or vice versa, `data` vs `data.results` vs `data[0]`
- **Missing required field** — required field omitted in payload, backend returns validation error
- **Date format mismatch** — ISO string vs timestamp vs locale string
- **Encoding mismatch** — UTF-8 vs Latin-1, URL encoding, HTML entities
## Regex / String
- **Sticky lastIndex** — regex `g` flag with `.test()` then `.exec()`, `lastIndex` not reset between calls
- **Missing escape** — `.` matches any char, `$` is special, backslash needs doubling
- **Greedy overmatch** — `.*` eats through delimiters, need `.*?`
- **Wrong quote type** — string interpolation needs backticks for template literals
## Error Handling
- **Swallowed error** — empty `catch {}` or logs but doesn't rethrow/handle
- **Wrong error type** — catches base `Error` when specific type needed
- **Error in handler** — cleanup code throws, masking original error
- **Unhandled rejection** — missing `.catch()` or try/catch around `await`
## Scope / Closure
- **Variable shadowing** — inner scope declares same name, hides outer variable
- **Loop variable capture** — all closures share same `var i`, use `let` or bind
- **Lost this binding** — callback loses context, need `.bind()` or arrow function
- **Scope confusion** — `var` hoisted to function, `let`/`const` block-scoped
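The loop-variable capture pattern in runnable JavaScript:

```javascript
// var: one shared binding, so every callback sees the final value.
const withVar = [];
for (var i = 0; i < 3; i++) withVar.push(() => i);
console.log(withVar.map((f) => f())); // [ 3, 3, 3 ]

// let: a fresh binding per iteration, so each callback keeps its own value.
const withLet = [];
for (let j = 0; j < 3; j++) withLet.push(() => j);
console.log(withLet.map((f) => f())); // [ 0, 1, 2 ]
```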
</patterns>


@@ -0,0 +1,70 @@
# Gates Taxonomy
Canonical gate types used across GSD workflows. Every validation checkpoint maps to one of these four types.
---
## Gate Types
### Pre-flight Gate
**Purpose:** Validates preconditions before starting an operation.
**Behavior:** Blocks entry if conditions unmet. No partial work created.
**Recovery:** Fix the missing precondition, then retry.
**Examples:**
- Plan-phase checks for REQUIREMENTS.md before planning
- Execute-phase validates PLAN.md exists before execution
- Discuss-phase confirms phase exists in ROADMAP.md
### Revision Gate
**Purpose:** Evaluates output quality and routes to revision if insufficient.
**Behavior:** Loops back to producer with specific feedback. Bounded by iteration cap.
**Recovery:** Producer addresses feedback; checker re-evaluates. The loop also escalates early if the issue count does not decrease between consecutive iterations (stall detection). After max iterations, it escalates unconditionally.
**Examples:**
- Plan-checker reviewing PLAN.md (max 3 iterations)
- Verifier checking phase deliverables against success criteria
### Escalation Gate
**Purpose:** Surfaces unresolvable issues to the developer for a decision.
**Behavior:** Pauses workflow, presents options, waits for human input.
**Recovery:** Developer chooses action; workflow resumes on selected path.
**Examples:**
- Revision loop exhausted after 3 iterations
- Merge conflict during worktree cleanup
- Ambiguous requirement needing clarification
### Abort Gate
**Purpose:** Terminates the operation to prevent damage or waste.
**Behavior:** Stops immediately, preserves state, reports reason.
**Recovery:** Developer investigates root cause, fixes, restarts from checkpoint.
**Examples:**
- Context window critically low during execution
- STATE.md in error state blocking /gsd-next
- Verification finds critical missing deliverables
---
## Gate Matrix
| Workflow | Phase | Gate Type | Artifacts Checked | Failure Behavior |
|----------|-------|-----------|-------------------|------------------|
| plan-phase | Entry | Pre-flight | REQUIREMENTS.md, ROADMAP.md | Block with missing-file message |
| plan-phase | Step 12 | Revision | PLAN.md quality | Loop to planner (max 3) |
| plan-phase | Post-revision | Escalation | Unresolved issues | Surface to developer |
| execute-phase | Entry | Pre-flight | PLAN.md | Block with missing-plan message |
| execute-phase | Completion | Revision | SUMMARY.md completeness | Re-run incomplete tasks |
| verify-work | Entry | Pre-flight | SUMMARY.md | Block with missing-summary |
| verify-work | Evaluation | Escalation | Failed criteria | Surface gaps to developer |
| next | Entry | Abort | Error state, checkpoints | Stop with diagnostic |
---
## Implementing Gates
Use this taxonomy when designing or auditing workflow validation points:
- **Pre-flight** gates belong at workflow entry points. They are cheap, deterministic checks that prevent wasted work. If you can verify a precondition with a file-existence check or a config read, use a pre-flight gate.
- **Revision** gates belong after a producer step where quality varies. Always pair them with an iteration cap to prevent infinite loops. The cap should reflect the cost of each iteration -- expensive operations get fewer retries.
- **Escalation** gates belong wherever automated resolution is impossible or ambiguous. They are the safety valve between revision loops and abort. Present the developer with clear options and enough context to decide.
- **Abort** gates belong at points where continuing would cause damage, waste significant resources, or produce meaningless output. They should preserve state so work can resume after the root cause is fixed.
**Selection heuristic:** Start with pre-flight. If the check happens after work is produced, it is a revision gate. If the revision loop cannot resolve the issue, escalate. If continuing is dangerous, abort.
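The selection heuristic above can be sketched as a small decision function. This is an illustration, not a helper that ships with GSD; the flag names are invented:

```javascript
// Illustrative sketch of the gate-selection heuristic. Flag names are invented.
function selectGateType({ workProduced = false, revisionsExhausted = false, continuingIsDangerous = false }) {
  if (continuingIsDangerous) return "abort";   // continuing would cause damage
  if (!workProduced) return "pre-flight";      // cheap precondition check at entry
  if (revisionsExhausted) return "escalation"; // revision loop could not resolve the issue
  return "revision";                           // quality check after a producer step
}

console.log(selectGateType({ workProduced: true })); // "revision"
```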


@@ -0,0 +1,123 @@
# iOS App Scaffold Reference
Rules and patterns for scaffolding iOS applications. Apply when any plan involves creating a new iOS app target.
---
## Critical Rule: Never Use Package.swift as the Primary Build System for iOS Apps
**NEVER use `Package.swift` with `.executableTarget` (or `.target`) to scaffold an iOS app.** Swift Package Manager executable targets compile as macOS command-line tools — they do not produce `.app` bundles, cannot be signed for iOS devices, and cannot be submitted to the App Store.
**Prohibited pattern:**
```swift
// Package.swift: DO NOT USE for iOS apps
.executableTarget(name: "MyApp", dependencies: [])
// or
.target(name: "MyApp", dependencies: [])
```
Using this pattern produces a macOS CLI binary, not an iOS app. The app will not build for any iOS simulator or device.
---
## Required Pattern: XcodeGen
All iOS app scaffolding MUST use XcodeGen to generate the `.xcodeproj`.
### Step 1 — Install XcodeGen (if not present)
```bash
brew install xcodegen
```
### Step 2 — Create `project.yml`
`project.yml` is the XcodeGen spec that describes the project structure. Minimum viable spec:
```yaml
name: MyApp
options:
  bundleIdPrefix: com.example
  deploymentTarget:
    iOS: "17.0"
settings:
  SWIFT_VERSION: "5.10"
  IPHONEOS_DEPLOYMENT_TARGET: "17.0"
targets:
  MyApp:
    type: application
    platform: iOS
    sources: [Sources/MyApp]
    settings:
      PRODUCT_BUNDLE_IDENTIFIER: com.example.MyApp
      INFOPLIST_FILE: Sources/MyApp/Info.plist
    scheme:
      testTargets:
        - MyAppTests
  MyAppTests:
    type: bundle.unit-test
    platform: iOS
    sources: [Tests/MyAppTests]
    dependencies:
      - target: MyApp
```
### Step 3 — Generate the .xcodeproj
```bash
xcodegen generate
```
This creates `MyApp.xcodeproj` in the project root. Commit `project.yml` but add `*.xcodeproj` to `.gitignore` (regenerate on checkout).
### Step 4 — Standard project layout
```
MyApp/
├── project.yml              # XcodeGen spec — commit this
├── .gitignore               # includes *.xcodeproj
├── Sources/
│   └── MyApp/
│       ├── MyAppApp.swift   # @main entry point
│       ├── ContentView.swift
│       └── Info.plist
└── Tests/
    └── MyAppTests/
        └── MyAppTests.swift
```
---
## iOS Deployment Target Compatibility
Always verify SwiftUI API availability against the project's `IPHONEOS_DEPLOYMENT_TARGET` before using any SwiftUI component.
| API | Minimum iOS |
|-----|-------------|
| `NavigationView` | iOS 13 |
| `NavigationStack` | iOS 16 |
| `NavigationSplitView` | iOS 16 |
| `List(selection:)` with multi-select | iOS 17 |
| `ScrollView` scroll position APIs | iOS 17 |
| `Observable` macro (`@Observable`) | iOS 17 |
| `SwiftData` | iOS 17 |
| `@Bindable` | iOS 17 |
| `TipKit` | iOS 17 |
**Rule:** If a plan requires a SwiftUI API that exceeds the project's deployment target, either:
1. Raise the deployment target in `project.yml` (and document the decision), or
2. Wrap the call in `if #available(iOS NN, *) { ... }` with a fallback implementation.
Do NOT silently use an API that requires a higher iOS version than the declared deployment target — the app will crash at runtime on older devices.
---
## Verification
After running `xcodegen generate`, verify the project builds:
```bash
xcodebuild -project MyApp.xcodeproj -scheme MyApp -destination 'platform=iOS Simulator,name=iPhone 16' build
```
A successful build (exit code 0) confirms the scaffold is valid for iOS.


@@ -225,7 +225,7 @@ Generated from `CONFIG_DEFAULTS` (core.cjs) and `VALID_CONFIG_KEYS` (config.cjs)
| Key | Type | Default | Allowed Values | Description |
|-----|------|---------|----------------|-------------|
| `model_profile` | string | `"balanced"` | `"quality"`, `"balanced"`, `"budget"`, `"inherit"` | Model selection preset for subagents |
| `mode` | string | `"interactive"` | `"interactive"`, `"yolo"` | Operation mode: `"interactive"` shows gates and confirmations; `"yolo"` runs autonomously without prompts |
| `granularity` | string | (none) | `"coarse"`, `"standard"`, `"fine"` | Planning depth for phase plans (migrated from deprecated `depth`) |
| `commit_docs` | boolean | `true` | `true`, `false` | Commit .planning/ artifacts to git (auto-false if .planning/ is gitignored) |
| `search_gitignored` | boolean | `false` | `true`, `false` | Include gitignored paths in broad rg searches via `--no-ignore` |
@@ -234,6 +234,8 @@ Generated from `CONFIG_DEFAULTS` (core.cjs) and `VALID_CONFIG_KEYS` (config.cjs)
| `response_language` | string\|null | `null` | Any language name | Language for user-facing prompts (e.g., `"Portuguese"`, `"Japanese"`) |
| `context_window` | number | `200000` | `200000`, `1000000` | Context window size; set `1000000` for 1M-context models |
| `resolve_model_ids` | boolean\|string | `false` | `false`, `true`, `"omit"` | Map model aliases to full Claude IDs; `"omit"` returns empty string |
| `context` | string\|null | `null` | `"dev"`, `"research"`, `"review"` | Execution context profile that adjusts agent behavior: `"dev"` for development tasks, `"research"` for investigation/exploration, `"review"` for code review workflows |
| `review.models.<cli>` | string\|null | `null` | Any model ID string | Per-CLI model override for /gsd-review (e.g., `review.models.gemini`). Falls back to CLI default when null. |
### Workflow Fields
@@ -248,14 +250,17 @@ Set via `workflow.*` namespace in config.json (e.g., `"workflow": { "research":
| `workflow.auto_advance` | boolean | `false` | `true`, `false` | Auto-advance to next phase after completion |
| `workflow.node_repair` | boolean | `true` | `true`, `false` | Attempt automatic repair of failed plan nodes |
| `workflow.node_repair_budget` | number | `2` | Any positive integer | Max repair retries per failed node |
| `workflow.ai_integration_phase` | boolean | `true` | `true`, `false` | Run /gsd-ai-integration-phase before planning AI system phases |
| `workflow.ui_phase` | boolean | `true` | `true`, `false` | Generate UI-SPEC.md for frontend phases |
| `workflow.ui_safety_gate` | boolean | `true` | `true`, `false` | Require safety gate approval for UI changes |
| `workflow.text_mode` | boolean | `false` | `true`, `false` | Use plain-text numbered lists instead of AskUserQuestion menus |
| `workflow.research_before_questions` | boolean | `false` | `true`, `false` | Run research before interactive questions in discuss phase |
| `workflow.discuss_mode` | string | `"discuss"` | `"discuss"`, `"assumptions"` | Default mode for discuss-phase: `"discuss"` runs interactive questioning; `"assumptions"` analyzes codebase and surfaces assumptions instead |
| `workflow.skip_discuss` | boolean | `false` | `true`, `false` | Skip discuss phase entirely |
| `workflow.use_worktrees` | boolean | `true` | `true`, `false` | Run executor agents in isolated git worktrees |
| `workflow.subagent_timeout` | number | `300000` | Any positive integer (ms) | Timeout for parallel subagent tasks (default: 5 minutes) |
| `workflow.code_review` | boolean | `true` | `true`, `false` | Enable built-in code review step in the ship workflow |
| `workflow.code_review_depth` | string | `"standard"` | `"light"`, `"standard"`, `"deep"` | Depth level for code review analysis in the ship workflow |
| `workflow._auto_chain_active` | boolean | `false` | `true`, `false` | Internal: tracks whether autonomous chaining is active |
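For reference, a sketch of how the `workflow.*` keys above nest in `config.json` (values are illustrative, not recommendations):

```json
{
  "workflow": {
    "auto_advance": false,
    "discuss_mode": "discuss",
    "use_worktrees": true,
    "subagent_timeout": 300000,
    "code_review": true,
    "code_review_depth": "standard"
  }
}
```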
### Git Fields
@@ -287,6 +292,7 @@ Set via `features.*` namespace (e.g., `"features": { "thinking_partner": true }`
| Key | Type | Default | Allowed Values | Description |
|-----|------|---------|----------------|-------------|
| `features.thinking_partner` | boolean | `false` | `true`, `false` | Enable conditional extended thinking at workflow decision points (used by discuss-phase and plan-phase for architectural tradeoff analysis) |
| `features.global_learnings` | boolean | `false` | `true`, `false` | Enable injection of global learnings from `~/.gsd/learnings/` into agent prompts |
### Hook Fields
@@ -296,6 +302,14 @@ Set via `hooks.*` namespace (e.g., `"hooks": { "context_warnings": true }`).
|-----|------|---------|----------------|-------------|
| `hooks.context_warnings` | boolean | `true` | `true`, `false` | Show warnings when context budget is exceeded |
### Learnings Fields
Set via `learnings.*` namespace (e.g., `"learnings": { "max_inject": 5 }`). Used together with `features.global_learnings`.
| Key | Type | Default | Allowed Values | Description |
|-----|------|---------|----------------|-------------|
| `learnings.max_inject` | number | `10` | Any positive integer | Maximum number of global learning entries to inject into agent prompts per session |
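A sketch of the capping behavior `learnings.max_inject` implies (illustrative only; `selectLearnings` is not a real gsd-tools function):

```javascript
// Illustrative only: selectLearnings is not a real gsd-tools API.
function selectLearnings(entries, maxInject = 10) {
  // Inject at most maxInject global learning entries per session.
  return entries.slice(0, maxInject);
}

const entries = Array.from({ length: 25 }, (_, i) => `learning-${i}`);
console.log(selectLearnings(entries).length);    // 10
console.log(selectLearnings(entries, 3).length); // 3
```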
### Manager Fields
Set via `manager.*` namespace (e.g., `"manager": { "flags": { "discuss": "--auto" } }`).


@@ -56,3 +56,8 @@ Reference: `references/questioning.md` for the full anti-pattern list.
25. **Always use `gsd-tools.cjs`** (not `gsd-tools.js` or any other variant) -- GSD uses CommonJS for Node.js CLI compatibility.
26. **Plan files MUST follow `{padded_phase}-{NN}-PLAN.md` pattern** (e.g., `01-01-PLAN.md`). Never use `PLAN-01.md`, `plan-01.md`, or any other variation -- gsd-tools detection depends on this exact pattern.
27. **Do not start executing the next plan before writing the SUMMARY.md for the current plan** -- downstream plans may reference it via `@` includes.
## iOS / Apple Platform Rules
28. **NEVER use `Package.swift` + `.executableTarget` (or `.target`) as the primary build system for iOS apps.** SPM executable targets produce macOS CLI binaries, not iOS `.app` bundles. They cannot be installed on iOS devices or submitted to the App Store. Use XcodeGen (`project.yml` + `xcodegen generate`) to create a proper `.xcodeproj`. See `references/ios-scaffold.md` for the full pattern.
29. **Verify SwiftUI API availability before use.** Many SwiftUI APIs require a specific minimum iOS version (e.g., `NavigationSplitView` is iOS 16+, `List(selection:)` with multi-select and `@Observable` require iOS 17). If a plan uses an API that exceeds the declared `IPHONEOS_DEPLOYMENT_TARGET`, raise the deployment target or add `#available` guards.


@@ -0,0 +1,246 @@
# AI-SPEC — Phase {N}: {phase_name}
> AI design contract generated by `/gsd-ai-integration-phase`. Consumed by `gsd-planner` and `gsd-eval-auditor`.
> Locks framework selection, implementation guidance, and evaluation strategy before planning begins.
---
## 1. System Classification
**System Type:** <!-- RAG | Multi-Agent | Conversational | Extraction | Autonomous Agent | Content Generation | Code Automation | Hybrid -->
**Description:**
<!-- One-paragraph description of what this AI system does, who uses it, and what "good" looks like -->
**Critical Failure Modes:**
<!-- The 3-5 behaviors that absolutely cannot go wrong in this system -->
1.
2.
3.
---
## 1b. Domain Context
> Researched by `gsd-domain-researcher`. Grounds the evaluation strategy in domain expert knowledge.
**Industry Vertical:** <!-- healthcare | legal | finance | customer service | education | developer tooling | e-commerce | etc. -->
**User Population:** <!-- who uses this system and in what context -->
**Stakes Level:** <!-- Low | Medium | High | Critical -->
**Output Consequence:** <!-- what happens downstream when the AI output is acted on -->
### What Domain Experts Evaluate Against
<!-- Domain-specific rubric ingredients — in practitioner language, not AI jargon -->
<!-- Format: Dimension / Good (expert accepts) / Bad (expert flags) / Stakes / Source -->
### Known Failure Modes in This Domain
<!-- Domain-specific failure modes from research — not generic hallucination, but how it manifests here -->
### Regulatory / Compliance Context
<!-- Relevant regulations or constraints — or "None identified" if genuinely none apply -->
### Domain Expert Roles for Evaluation
| Role | Responsibility |
|------|---------------|
| <!-- e.g., Senior practitioner --> | <!-- Dataset labeling / rubric calibration / production sampling --> |
---
## 2. Framework Decision
**Selected Framework:** <!-- e.g., LlamaIndex v0.10.x -->
**Version:** <!-- Pin the version -->
**Rationale:**
<!-- Why this framework fits this system type, team context, and production requirements -->
**Alternatives Considered:**
| Framework | Ruled Out Because |
|-----------|------------------|
| | |
**Vendor Lock-In Accepted:** <!-- Yes / No / Partial — document the trade-off consciously -->
---
## 3. Framework Quick Reference
> Fetched from official docs by `gsd-ai-researcher`. Distilled for this specific use case.
### Installation
```bash
# Install command(s)
```
### Core Imports
```python
# Key imports for this use case
```
### Entry Point Pattern
```python
# Minimal working example for this system type
```
### Key Abstractions
<!-- Framework-specific concepts the developer must understand before coding -->
| Concept | What It Is | When You Use It |
|---------|-----------|-----------------|
| | | |
### Common Pitfalls
<!-- Gotchas specific to this framework and system type — from docs, issues, and community reports -->
1.
2.
3.
### Recommended Project Structure
```
project/
├── # Framework-specific folder layout
```
---
## 4. Implementation Guidance
**Model Configuration:**
<!-- Which model(s), temperature, max tokens, and other key parameters -->
**Core Pattern:**
<!-- The primary implementation pattern for this system type in this framework -->
**Tool Use:**
<!-- Tools/integrations needed and how to configure them -->
**State Management:**
<!-- How state is persisted, retrieved, and updated -->
**Context Window Strategy:**
<!-- How to manage context limits for this system type -->
---
## 4b. AI Systems Best Practices
> Written by `gsd-ai-researcher`. Cross-cutting patterns every developer building AI systems needs — independent of framework choice.
### Structured Outputs with Pydantic
<!-- Framework-specific Pydantic integration pattern for this use case -->
<!-- Include: output model definition, how the framework uses it, retry logic on validation failure -->
```python
# Pydantic output model for this system type
```
### Async-First Design
<!-- How async is handled in this framework, the one common mistake, and when to stream vs. await -->
### Prompt Engineering Discipline
<!-- System vs. user prompt separation, few-shot guidance, token budget strategy -->
### Context Window Management
<!-- Strategy specific to this system type: RAG chunking / conversation summarisation / agent compaction -->
### Cost and Latency Budget
<!-- Per-call cost estimate, caching strategy, sub-task model routing -->
---
## 5. Evaluation Strategy
### Dimensions
| Dimension | Rubric (Pass/Fail or 1-5) | Measurement Approach | Priority |
|-----------|--------------------------|---------------------|----------|
| | | Code / LLM Judge / Human | Critical / High / Medium |
### Eval Tooling
**Primary Tool:** <!-- e.g., RAGAS + Langfuse -->
**Setup:**
```bash
# Install and configure
```
**CI/CD Integration:**
```bash
# Command to run evals in CI/CD pipeline
```
### Reference Dataset
**Size:** <!-- e.g., 20 examples to start -->
**Composition:**
<!-- What scenario types the dataset covers: critical paths, edge cases, failure modes -->
**Labeling:**
<!-- Who labels examples and how (domain expert, LLM judge with calibration, etc.) -->
---
## 6. Guardrails
### Online (Real-Time)
| Guardrail | Trigger | Intervention |
|-----------|---------|--------------|
| | | Block / Escalate / Flag |
### Offline (Flywheel)
| Metric | Sampling Strategy | Action on Degradation |
|--------|------------------|----------------------|
| | | |
---
## 7. Production Monitoring
**Tracing Tool:** <!-- e.g., Langfuse self-hosted -->
**Key Metrics to Track:**
<!-- 3-5 metrics that will be monitored in production -->
**Alert Thresholds:**
<!-- When to page/alert -->
**Smart Sampling Strategy:**
<!-- How to select interactions for human review — signal-based filters -->
---
## Checklist
- [ ] System type classified
- [ ] Critical failure modes identified (≥ 3)
- [ ] Domain context researched (Section 1b: vertical, stakes, expert criteria, failure modes)
- [ ] Regulatory/compliance context identified or explicitly noted as none
- [ ] Domain expert roles defined for evaluation involvement
- [ ] Framework selected with rationale documented
- [ ] Alternatives considered and ruled out
- [ ] Framework quick reference written (install, imports, pattern, pitfalls)
- [ ] AI systems best practices written (Section 4b: Pydantic, async, prompt discipline, context)
- [ ] Evaluation dimensions grounded in domain rubric ingredients
- [ ] Each eval dimension has a concrete rubric (Good/Bad in domain language)
- [ ] Eval tooling selected — Arize Phoenix default confirmed or override noted
- [ ] Reference dataset spec written (size ≥ 10, composition + labeling defined)
- [ ] CI/CD eval integration specified
- [ ] Online guardrails defined
- [ ] Production monitoring configured (tracing tool + sampling strategy)


@@ -108,6 +108,9 @@ Read each file to verify classification. Don't classify based on filename alone.
<step name="present_classification">
Present the classification to the user for confirmation before proceeding:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
```
AskUserQuestion(
header: "Test Classification",


@@ -70,6 +70,8 @@ If potential duplicate found:
1. Read the existing todo
2. Compare scope
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
If overlapping, use AskUserQuestion:
- header: "Duplicate?"
- question: "Similar todo exists: [title]. What would you like to do?"


@@ -0,0 +1,284 @@
<purpose>
Generate an AI design contract (AI-SPEC.md) for phases that involve building AI systems. Orchestrates gsd-framework-selector → gsd-ai-researcher → gsd-domain-researcher → gsd-eval-planner with a validation gate. Inserts between discuss-phase and plan-phase in the GSD lifecycle.
AI-SPEC.md locks four things before the planner creates tasks:
1. Framework selection (with rationale and alternatives)
2. Implementation guidance (correct syntax, patterns, pitfalls from official docs)
3. Domain context (practitioner rubric ingredients, failure modes, regulatory constraints)
4. Evaluation strategy (dimensions, rubrics, tooling, reference dataset, guardrails)
This prevents the two most common AI development failures: choosing the wrong framework for the use case, and treating evaluation as an afterthought.
</purpose>
<required_reading>
@~/.claude/get-shit-done/references/ai-frameworks.md
@~/.claude/get-shit-done/references/ai-evals.md
</required_reading>
<process>
## 1. Initialize
```bash
INIT=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" init plan-phase "$PHASE")
if [[ "$INIT" == @file:* ]]; then INIT=$(cat "${INIT#@file:}"); fi
```
Parse JSON for: `phase_dir`, `phase_number`, `phase_name`, `phase_slug`, `padded_phase`, `has_context`, `has_research`, `commit_docs`.
**File paths:** `state_path`, `roadmap_path`, `requirements_path`, `context_path`.
Resolve agent models:
```bash
SELECTOR_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" resolve-model gsd-framework-selector --raw)
RESEARCHER_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" resolve-model gsd-ai-researcher --raw)
DOMAIN_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" resolve-model gsd-domain-researcher --raw)
PLANNER_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" resolve-model gsd-eval-planner --raw)
```
Check config:
```bash
AI_PHASE_ENABLED=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get workflow.ai_integration_phase 2>/dev/null || echo "true")
```
**If `AI_PHASE_ENABLED` is `false`:**
```
AI phase is disabled in config. Enable via /gsd-settings.
```
Exit workflow.
**If `planning_exists` is false:** Error — run `/gsd-new-project` first.
## 2. Parse and Validate Phase
Extract phase number from $ARGUMENTS. If not provided, detect next unplanned phase.
```bash
PHASE_INFO=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" roadmap get-phase "${PHASE}")
```
**If `found` is false:** Error with available phases.
## 3. Check Prerequisites
**If `has_context` is false:**
```
No CONTEXT.md found for Phase {N}.
Recommended: run /gsd-discuss-phase {N} first to capture framework preferences.
Continuing without user decisions — framework selector will ask all questions.
```
Continue (non-blocking).
## 4. Check Existing AI-SPEC
```bash
AI_SPEC_FILE=$(ls "${PHASE_DIR}"/*-AI-SPEC.md 2>/dev/null | head -1)
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
**If exists:** Use AskUserQuestion:
- header: "Existing AI-SPEC"
- question: "AI-SPEC.md already exists for Phase {N}. What would you like to do?"
- options:
- "Update — re-run with existing as baseline"
- "View — display current AI-SPEC and exit"
- "Skip — keep current AI-SPEC and exit"
If "View": display file contents, exit.
If "Skip": exit.
If "Update": continue to step 5.
## 5. Spawn gsd-framework-selector
Display:
```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GSD ► AI DESIGN CONTRACT — PHASE {N}: {name}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
◆ Step 1/4 — Framework Selection...
```
Spawn `gsd-framework-selector` with:
```markdown
Read ~/.claude/agents/gsd-framework-selector.md for instructions.
<objective>
Select the right AI framework for Phase {phase_number}: {phase_name}
Goal: {phase_goal}
</objective>
<files_to_read>
{context_path if exists}
{requirements_path if exists}
</files_to_read>
<phase_context>
Phase: {phase_number} — {phase_name}
Goal: {phase_goal}
</phase_context>
```
Parse selector output for: `primary_framework`, `system_type`, `model_provider`, `eval_concerns`, `alternative_framework`.
**If selector fails or returns empty:** Exit with error — "Framework selection failed. Re-run /gsd-ai-integration-phase {N} or answer the framework question in /gsd-discuss-phase {N} first."
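One way to sketch that parsing, assuming the selector reports its fields as `key: value` lines (the exact output format is an assumption; adjust the patterns to what the selector actually emits):

```shell
# Illustrative selector report; real output comes from the spawned agent.
SELECTOR_OUTPUT='primary_framework: langgraph
system_type: agentic-pipeline
model_provider: anthropic'
# Pull a single "key: value" field out of the report text
get_field() { printf '%s\n' "$SELECTOR_OUTPUT" | sed -n "s/^$1: //p" | head -1; }
PRIMARY_FRAMEWORK=$(get_field primary_framework)
SYSTEM_TYPE=$(get_field system_type)
# An empty primary_framework means the selector failed: surface the error above
[ -n "$PRIMARY_FRAMEWORK" ] || echo "Framework selection failed" >&2
```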
## 6. Initialize AI-SPEC.md
Copy template:
```bash
cp "$HOME/.claude/get-shit-done/templates/AI-SPEC.md" "${PHASE_DIR}/${PADDED_PHASE}-AI-SPEC.md"
```
Fill in header fields:
- Phase number and name
- System classification (from selector)
- Selected framework (from selector)
- Alternative considered (from selector)
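A sketch of the fill-in step. The `{placeholder}` tokens and field names below are assumptions; match them to the actual markers in the AI-SPEC.md template:

```shell
# Demo on a throwaway file; in the real flow $SPEC is the copied template.
PHASE_DIR=$(mktemp -d)
PADDED_PHASE="03"; PHASE_NUMBER="3"; PHASE_NAME="AI Integration"
PRIMARY_FRAMEWORK="langgraph"; ALTERNATIVE="crewai"
SPEC="${PHASE_DIR}/${PADDED_PHASE}-AI-SPEC.md"
printf '# AI-SPEC Phase {phase_number}: {phase_name}\nFramework: {framework}\nAlternative: {alternative}\n' > "$SPEC"
# Substitute each placeholder in place (keep a .bak for sed portability)
sed -i.bak \
  -e "s/{phase_number}/${PHASE_NUMBER}/g" \
  -e "s/{phase_name}/${PHASE_NAME}/g" \
  -e "s/{framework}/${PRIMARY_FRAMEWORK}/g" \
  -e "s/{alternative}/${ALTERNATIVE}/g" \
  "$SPEC" && rm -f "${SPEC}.bak"
```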
## 7. Spawn gsd-ai-researcher
Display:
```
◆ Step 2/4 — Researching {primary_framework} docs + AI systems best practices...
```
Spawn `gsd-ai-researcher` with:
```markdown
Read ~/.claude/agents/gsd-ai-researcher.md for instructions.
<objective>
Research {primary_framework} for Phase {phase_number}: {phase_name}
Write Sections 3 and 4 of AI-SPEC.md
</objective>
<files_to_read>
{ai_spec_path}
{context_path if exists}
</files_to_read>
<input>
framework: {primary_framework}
system_type: {system_type}
model_provider: {model_provider}
ai_spec_path: {ai_spec_path}
phase_context: Phase {phase_number}: {phase_name} — {phase_goal}
</input>
```
## 8. Spawn gsd-domain-researcher
Display:
```
◆ Step 3/4 — Researching domain context and expert evaluation criteria...
```
Spawn `gsd-domain-researcher` with:
```markdown
Read ~/.claude/agents/gsd-domain-researcher.md for instructions.
<objective>
Research the business domain and expert evaluation criteria for Phase {phase_number}: {phase_name}
Write Section 1b (Domain Context) of AI-SPEC.md
</objective>
<files_to_read>
{ai_spec_path}
{context_path if exists}
{requirements_path if exists}
</files_to_read>
<input>
system_type: {system_type}
phase_name: {phase_name}
phase_goal: {phase_goal}
ai_spec_path: {ai_spec_path}
</input>
```
## 9. Spawn gsd-eval-planner
Display:
```
◆ Step 4/4 — Designing evaluation strategy from domain + technical context...
```
Spawn `gsd-eval-planner` with:
```markdown
Read ~/.claude/agents/gsd-eval-planner.md for instructions.
<objective>
Design evaluation strategy for Phase {phase_number}: {phase_name}
Write Sections 5, 6, and 7 of AI-SPEC.md
AI-SPEC.md now contains domain context (Section 1b) — use it as your rubric starting point.
</objective>
<files_to_read>
{ai_spec_path}
{context_path if exists}
{requirements_path if exists}
</files_to_read>
<input>
system_type: {system_type}
framework: {primary_framework}
model_provider: {model_provider}
phase_name: {phase_name}
phase_goal: {phase_goal}
ai_spec_path: {ai_spec_path}
</input>
```
## 10. Validate AI-SPEC Completeness
Read the completed AI-SPEC.md. Check that:
- Section 2 has a framework name (not placeholder)
- Section 1b has at least one domain rubric ingredient (Good/Bad/Stakes)
- Section 3 has a non-empty code block (entry point pattern)
- Section 4b has a Pydantic example
- Section 5 has at least one row in the dimensions table
- Section 6 has at least one guardrail or explicit "N/A for internal tool" note
- Checklist section at end has 3+ items checked
**If validation fails:** Display specific missing sections. Ask user if they want to re-run the specific step or continue anyway.
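The mechanical parts of these checks can be sketched like this (the marker conventions are assumptions; judgment calls such as "is this a placeholder" still need the model):

```shell
# Illustrative stand-in for the real AI-SPEC.md
SPEC=$(mktemp)
TICKS=$(printf '\140\140\140')   # three backticks, kept out of this fence
{
  printf '## 3. Entry Point\n%s\napp = build()\n%s\n' "$TICKS" "$TICKS"
  printf -- '- [x] evals defined\n- [x] tracing wired\n- [x] guardrails listed\n'
} > "$SPEC"
MISSING=""
# At least one non-empty fenced code block (entry point pattern, Section 3)
awk -v t="$TICKS" '$0==t {f=!f; next} f && NF {found=1} END {exit !found}' "$SPEC" \
  || MISSING="$MISSING code-block"
# Checklist at the end has 3+ checked items
[ "$(grep -c '^- \[x\]' "$SPEC")" -ge 3 ] || MISSING="$MISSING checklist"
if [ -z "$MISSING" ]; then RESULT="pass"; else RESULT="fail:$MISSING"; fi
```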
## 11. Commit
**If `commit_docs` is true:**
```bash
git add "${AI_SPEC_FILE}"
git commit -m "docs({phase_slug}): generate AI-SPEC.md — {primary_framework} + domain context + eval strategy"
```
## 12. Display Completion
```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GSD ► AI-SPEC COMPLETE — PHASE {N}: {name}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
◆ Framework: {primary_framework}
◆ System Type: {system_type}
◆ Domain: {domain_vertical from Section 1b}
◆ Eval Dimensions: {eval_concerns}
◆ Tracing Default: Arize Phoenix (or detected existing tool)
◆ Output: {ai_spec_path}
Next step:
/gsd-plan-phase {N} — planner will consume AI-SPEC.md
```
</process>
<success_criteria>
- [ ] Framework selected with rationale (Section 2)
- [ ] AI-SPEC.md created from template
- [ ] Framework docs + AI best practices researched (Sections 3, 4, 4b populated)
- [ ] Domain context + expert rubric ingredients researched (Section 1b populated)
- [ ] Eval strategy grounded in domain context (Sections 5-7 populated)
- [ ] Arize Phoenix (or detected tool) set as tracing default in Section 7
- [ ] AI-SPEC.md validated (Sections 1b, 2, 3, 4b, 5, 6 all non-empty)
- [ ] Committed if commit_docs enabled
- [ ] Next step surfaced to user
</success_criteria>

View File

@@ -412,6 +412,8 @@ Proceed to iterate step.
Read the human_verification section from VERIFICATION.md to get the count and items requiring manual testing.
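A minimal extraction sketch, assuming the section appears as a markdown heading (adapt the pattern to VERIFICATION.md's real layout):

```shell
# Stand-in for the real VERIFICATION.md
VERIFICATION=$(mktemp)
printf '## Automated Checks\nall pass\n## Human Verification\n- [ ] login flow works on mobile\n- [ ] emails render in dark mode\n' > "$VERIFICATION"
# Grab everything under the human-verification heading up to the next section
HUMAN_ITEMS=$(awk '/^## Human Verification/ {f=1; next} /^## / {f=0} f' "$VERIFICATION")
HUMAN_COUNT=$(printf '%s\n' "$HUMAN_ITEMS" | grep -c '^- ')
```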
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Display the items, then ask user via AskUserQuestion:
- **question:** "Phase ${PHASE_NUM} has items needing manual verification. Validate now or continue to next phase?"
- **options:** "Validate now" / "Continue without validation"

View File

@@ -102,6 +102,8 @@ If `.planning/ROADMAP.md` exists:
<step name="offer_actions">
**If todo maps to a roadmap phase:**
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Action"
- question: "This todo relates to Phase [N]: [name]. What would you like to do?"

View File

@@ -93,6 +93,8 @@ No phase directories found to archive. Phases may have been removed or archived
Stop here.
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
AskUserQuestion: "Proceed with archiving?" with options: "Yes — archive listed phases" | "Cancel"
If "Cancel": Stop.

View File

@@ -20,8 +20,8 @@ When a milestone completes:
1. Extract full milestone details to `.planning/milestones/v[X.Y]-ROADMAP.md`
2. Archive requirements to `.planning/milestones/v[X.Y]-REQUIREMENTS.md`
-3. Update ROADMAP.md — replace milestone details with one-line summary
-4. Delete REQUIREMENTS.md (fresh one for next milestone)
+3. Update ROADMAP.md — overwrite in place with milestone grouping (preserve Backlog section)
+4. Safety commit archive files + updated ROADMAP.md, then `git rm REQUIREMENTS.md` (fresh for next milestone)
5. Perform full PROJECT.md evolution review
6. Offer to create next milestone inline
7. Archive UI artifacts (`*-UI-SPEC.md`, `*-UI-REVIEW.md`) alongside other phase documents
@@ -387,6 +387,8 @@ Verify: `✅ Milestone archived to .planning/milestones/`
**Phase archival (optional):** After archival completes, ask the user:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
AskUserQuestion(header="Archive Phases", question="Archive phase directories to milestones/?", options: "Yes — move to milestones/v[X.Y]-phases/" | "Skip — keep phases in place")
If "Yes": move phase directories to the milestone archive:
@@ -400,18 +402,29 @@ Verify: `✅ Phase directories archived to .planning/milestones/v[X.Y]-phases/`
If "Skip": Phase directories remain in `.planning/phases/` as raw execution history. Use `/gsd-cleanup` later to archive retroactively.
After archival, the AI still handles:
-- Reorganizing ROADMAP.md with milestone grouping (requires judgment)
+- Reorganizing ROADMAP.md with milestone grouping (requires judgment) — overwrite in place after extracting Backlog section
- Full PROJECT.md evolution review (requires understanding)
-- Deleting original ROADMAP.md and REQUIREMENTS.md
+- Safety commit of archive files + updated ROADMAP.md, then `git rm .planning/REQUIREMENTS.md`
- These are NOT fully delegated because they require AI interpretation of content
</step>
<step name="reorganize_roadmap_and_delete_originals">
-After `milestone complete` has archived, reorganize ROADMAP.md with milestone groupings, then delete originals:
+After `milestone complete` has archived, reorganize ROADMAP.md with milestone groupings, then commit archives as a safety checkpoint before removing originals.
**Reorganize ROADMAP.md** — group completed milestone phases:
**Backlog preservation — do this FIRST before rewriting ROADMAP.md:**
Extract the Backlog section from the current ROADMAP.md before making any changes:
```bash
# Extract everything from the ## Backlog heading through end of file
BACKLOG_SECTION=$(awk '/^## Backlog/{found=1} found{print}' .planning/ROADMAP.md)
```
If `$BACKLOG_SECTION` is empty, there is no Backlog section — skip silently.
**Reorganize ROADMAP.md** — overwrite in place (do NOT delete first) with milestone groupings:
```markdown
# Roadmap: [Project Name]
@@ -432,11 +445,22 @@ After `milestone complete` has archived, reorganize ROADMAP.md with milestone gr
</details>
```
**Then delete originals:**
**Re-append Backlog section after the rewrite** (only if `$BACKLOG_SECTION` was non-empty):
Append the extracted Backlog content verbatim to the end of the newly written ROADMAP.md. This ensures 999.x backlog items are never silently dropped during milestone reorganization.
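Continuing the `$BACKLOG_SECTION` extraction above, the re-append is a guarded append (paths shortened for the demo):

```shell
# Stands in for .planning/ROADMAP.md, freshly rewritten with milestone grouping
ROADMAP=$(mktemp)
printf '# Roadmap: Demo\n\n### v1.1 Next Milestone\n' > "$ROADMAP"
BACKLOG_SECTION='## Backlog
- 999.1 nice-to-have idea'
# Only re-append when a Backlog section was actually extracted
if [ -n "$BACKLOG_SECTION" ]; then
  printf '\n%s\n' "$BACKLOG_SECTION" >> "$ROADMAP"
fi
```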
**Safety commit — commit archive files BEFORE deleting any originals:**
```bash
rm .planning/ROADMAP.md
rm .planning/REQUIREMENTS.md
node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" commit "chore: archive v[X.Y] milestone files" --files .planning/milestones/v[X.Y]-ROADMAP.md .planning/milestones/v[X.Y]-REQUIREMENTS.md .planning/milestones/v[X.Y]-MILESTONE-AUDIT.md .planning/MILESTONES.md .planning/PROJECT.md .planning/STATE.md .planning/ROADMAP.md
```
This creates a durable checkpoint in git history. If anything fails after this point, the working tree can be reconstructed from git.
**Remove REQUIREMENTS.md via git rm** (preserves history, stages deletion atomically):
```bash
git rm .planning/REQUIREMENTS.md
```
</step>
@@ -686,14 +710,13 @@ git push origin v[X.Y]
<step name="git_commit_milestone">
-Commit milestone completion.
+Commit the REQUIREMENTS.md deletion (archive files and ROADMAP.md were already committed in the safety commit in `reorganize_roadmap_and_delete_originals`).
```bash
-node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" commit "chore: complete v[X.Y] milestone" --files .planning/milestones/v[X.Y]-ROADMAP.md .planning/milestones/v[X.Y]-REQUIREMENTS.md .planning/milestones/v[X.Y]-MILESTONE-AUDIT.md .planning/MILESTONES.md .planning/PROJECT.md .planning/STATE.md
-```
+git commit -m "chore: remove REQUIREMENTS.md for v[X.Y] milestone"
+```
-Confirm: "Committed: chore: complete v[X.Y] milestone"
+Confirm: "Committed: chore: remove REQUIREMENTS.md for v[X.Y] milestone"
</step>
@@ -759,10 +782,12 @@ Milestone completion is successful when:
- [ ] PROJECT.md full evolution review completed
- [ ] All shipped requirements moved to Validated in PROJECT.md
- [ ] Key Decisions updated with outcomes
-- [ ] ROADMAP.md reorganized with milestone grouping
+- [ ] ROADMAP.md Backlog section extracted before rewrite, re-appended after (skipped if absent)
+- [ ] ROADMAP.md reorganized with milestone grouping (overwritten in place, not deleted)
- [ ] Roadmap archive created (milestones/v[X.Y]-ROADMAP.md)
- [ ] Requirements archive created (milestones/v[X.Y]-REQUIREMENTS.md)
-- [ ] REQUIREMENTS.md deleted (fresh for next milestone)
+- [ ] Safety commit made (archive files + updated ROADMAP.md) BEFORE deleting REQUIREMENTS.md
+- [ ] REQUIREMENTS.md removed via `git rm` (fresh for next milestone, history preserved)
- [ ] STATE.md updated with fresh project reference
- [ ] Git tag created (v[X.Y])
- [ ] Milestone commit made (includes archive files and deletion)

View File

@@ -97,7 +97,7 @@ For each gap, fill the debug-subagent-prompt template and spawn:
```
Task(
-prompt=filled_debug_subagent_prompt + "\n\n<worktree_branch_check>\nFIRST ACTION: run git merge-base HEAD {EXPECTED_BASE} — if result differs from {EXPECTED_BASE}, run git reset --soft {EXPECTED_BASE} to correct the branch base (fixes Windows EnterWorktree creating branches from main).\n</worktree_branch_check>\n\n<files_to_read>\n- {phase_dir}/{phase_num}-UAT.md\n- .planning/STATE.md\n</files_to_read>\n${AGENT_SKILLS_DEBUGGER}",
+prompt=filled_debug_subagent_prompt + "\n\n<worktree_branch_check>\nFIRST ACTION: run git merge-base HEAD {EXPECTED_BASE} — if result differs from {EXPECTED_BASE}, run git reset --hard {EXPECTED_BASE} to correct the branch base (safe — runs before any agent work). Then verify: if [ \"$(git rev-parse HEAD)\" != \"{EXPECTED_BASE}\" ]; then echo \"ERROR: Could not correct worktree base\"; exit 1; fi. Fixes EnterWorktree creating branches from main on all platforms.\n</worktree_branch_check>\n\n<files_to_read>\n- {phase_dir}/{phase_num}-UAT.md\n- .planning/STATE.md\n</files_to_read>\n${AGENT_SKILLS_DEBUGGER}",
subagent_type="gsd-debugger",
${USE_WORKTREES !== "false" ? 'isolation="worktree",' : ''}
description="Debug: {truth_short}"

View File

@@ -214,6 +214,8 @@ Write `.planning/phases/XX-name/DISCOVERY.md`:
After creating DISCOVERY.md, check confidence level.
If confidence is LOW:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Low Conf."

View File

@@ -11,6 +11,8 @@ Read all files referenced by the invoking prompt's execution_context before star
<step name="validate">
**Check for input.**
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
If `$ARGUMENTS` is empty, ask via AskUserQuestion:
```

View File

@@ -84,6 +84,8 @@ Assemble the complete doc queue from always-on docs plus conditional docs from c
If CONTRIBUTING.md is in the conditional queue AND does NOT appear in the `existing_docs` array from init JSON:
1. If `--force` is present in `$ARGUMENTS`: skip this check, include CONTRIBUTING.md in the queue.
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
2. Otherwise, use AskUserQuestion to confirm:
```

View File

@@ -0,0 +1,155 @@
<purpose>
Retroactive audit of an implemented AI phase's evaluation coverage. Standalone command that works on any GSD-managed AI phase. Produces a scored EVAL-REVIEW.md with gap analysis and remediation plan.
Use after /gsd-execute-phase to verify that the evaluation strategy from AI-SPEC.md was actually implemented. Mirrors the pattern of /gsd-ui-review and /gsd-validate-phase.
</purpose>
<required_reading>
@~/.claude/get-shit-done/references/ai-evals.md
</required_reading>
<process>
## 0. Initialize
```bash
INIT=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" init phase-op "${PHASE_ARG}")
if [[ "$INIT" == @file:* ]]; then INIT=$(cat "${INIT#@file:}"); fi
```
Parse: `phase_dir`, `phase_number`, `phase_name`, `phase_slug`, `padded_phase`, `commit_docs`.
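The init payload is JSON, so one way to sketch the parse is with node, which gsd-tools already requires (`json_get` and the sample payload are illustrative):

```shell
# Illustrative init payload; the real one comes from gsd-tools init above
INIT='{"phase_dir":".planning/phases/03-ai","phase_number":"3","phase_slug":"ai-eval","commit_docs":true}'
# Read one top-level field out of the JSON payload
json_get() {
  node -e 'const o = JSON.parse(process.argv[1]); const v = o[process.argv[2]]; console.log(v === undefined ? "" : v)' "$INIT" "$1"
}
PHASE_DIR=$(json_get phase_dir)
COMMIT_DOCS=$(json_get commit_docs)
```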
```bash
AUDITOR_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" resolve-model gsd-eval-auditor --raw)
```
Display banner:
```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GSD ► EVAL AUDIT — PHASE {N}: {name}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
```
## 1. Detect Input State
```bash
SUMMARY_FILES=$(ls "${PHASE_DIR}"/*-SUMMARY.md 2>/dev/null)
AI_SPEC_FILE=$(ls "${PHASE_DIR}"/*-AI-SPEC.md 2>/dev/null | head -1)
EVAL_REVIEW_FILE=$(ls "${PHASE_DIR}"/*-EVAL-REVIEW.md 2>/dev/null | head -1)
```
**State A** — AI-SPEC.md + SUMMARY.md exist: Full audit against spec
**State B** — SUMMARY.md exists, no AI-SPEC.md: Audit against general best practices
**State C** — No SUMMARY.md: Exit — "Phase {N} not executed. Run /gsd-execute-phase {N} first."
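The three states map onto the file probes above, for example:

```shell
# Demo phase directory simulating state A (both files present)
PHASE_DIR=$(mktemp -d)
touch "$PHASE_DIR/03-SUMMARY.md" "$PHASE_DIR/03-AI-SPEC.md"
SUMMARY_FILES=$(ls "$PHASE_DIR"/*-SUMMARY.md 2>/dev/null)
AI_SPEC_FILE=$(ls "$PHASE_DIR"/*-AI-SPEC.md 2>/dev/null | head -1)
if [ -z "$SUMMARY_FILES" ]; then
  STATE=C        # not executed: exit with the message above
elif [ -n "$AI_SPEC_FILE" ]; then
  STATE=A        # full audit against the spec
else
  STATE=B        # audit against general best practices
fi
```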
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
**If `EVAL_REVIEW_FILE` non-empty:** Use AskUserQuestion:
- header: "Existing Eval Review"
- question: "EVAL-REVIEW.md already exists for Phase {N}."
- options:
- "Re-audit — run fresh audit"
- "View — display current review and exit"
If "View": display file, exit.
If "Re-audit": continue.
**If State B (no AI-SPEC.md):** Warn:
```
No AI-SPEC.md found for Phase {N}.
Audit will evaluate against general AI eval best practices rather than a phase-specific plan.
Consider running /gsd-ai-integration-phase {N} before implementation next time.
```
Continue (non-blocking).
## 2. Gather Context Paths
Build file list for auditor:
- AI-SPEC.md (if exists — the planned eval strategy)
- All SUMMARY.md files in phase dir
- All PLAN.md files in phase dir
## 3. Spawn gsd-eval-auditor
```
◆ Spawning eval auditor...
```
Build prompt:
```markdown
Read ~/.claude/agents/gsd-eval-auditor.md for instructions.
<objective>
Conduct evaluation coverage audit of Phase {phase_number}: {phase_name}
{If AI-SPEC exists: "Audit against AI-SPEC.md evaluation plan."}
{If no AI-SPEC: "Audit against general AI eval best practices."}
</objective>
<files_to_read>
- {summary_paths}
- {plan_paths}
- {ai_spec_path if exists}
</files_to_read>
<input>
ai_spec_path: {ai_spec_path or "none"}
phase_dir: {phase_dir}
phase_number: {phase_number}
phase_name: {phase_name}
padded_phase: {padded_phase}
state: {A or B}
</input>
```
Spawn as Task with model `AUDITOR_MODEL`.
## 4. Parse Auditor Result
Read the written EVAL-REVIEW.md. Extract:
- `overall_score`
- `verdict` (PRODUCTION READY | NEEDS WORK | SIGNIFICANT GAPS | NOT IMPLEMENTED)
- `critical_gap_count`
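A sketch of that extraction, assuming the auditor writes these as labeled lines (adjust the patterns to the real EVAL-REVIEW.md template):

```shell
# Stand-in for the auditor's EVAL-REVIEW.md; the layout is an assumption
REVIEW=$(mktemp)
printf 'Overall Score: 82/100\nVerdict: NEEDS WORK\nCritical Gaps: 2\n' > "$REVIEW"
OVERALL_SCORE=$(sed -n 's|^Overall Score: \([0-9]*\).*|\1|p' "$REVIEW" | head -1)
VERDICT=$(sed -n 's|^Verdict: ||p' "$REVIEW" | head -1)
CRITICAL_GAP_COUNT=$(sed -n 's|^Critical Gaps: ||p' "$REVIEW" | head -1)
```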
## 5. Display Summary
```
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
GSD ► EVAL AUDIT COMPLETE — PHASE {N}: {name}
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
◆ Score: {overall_score}/100
◆ Verdict: {verdict}
◆ Critical Gaps: {critical_gap_count}
◆ Output: {eval_review_path}
{If PRODUCTION READY:}
Next step: /gsd-plan-phase (next phase) or deploy
{If NEEDS WORK:}
Address critical gaps in EVAL-REVIEW.md, then re-run /gsd-eval-review {N}
{If SIGNIFICANT GAPS or NOT IMPLEMENTED:}
Review AI-SPEC.md evaluation plan. Critical eval dimensions are not implemented.
Do not deploy until gaps are addressed.
```
## 6. Commit
**If `commit_docs` is true:**
```bash
git add "${EVAL_REVIEW_FILE}"
git commit -m "docs({phase_slug}): add EVAL-REVIEW.md — score {overall_score}/100 ({verdict})"
```
</process>
<success_criteria>
- [ ] Phase execution state detected correctly
- [ ] AI-SPEC.md presence handled (with or without)
- [ ] gsd-eval-auditor spawned with correct context
- [ ] EVAL-REVIEW.md written (by auditor)
- [ ] Score and verdict displayed to user
- [ ] Appropriate next steps surfaced based on verdict
- [ ] Committed if commit_docs enabled
</success_criteria>

View File

@@ -28,6 +28,7 @@ Read STATE.md before any operation to load project context.
@~/.claude/get-shit-done/references/agent-contracts.md
@~/.claude/get-shit-done/references/context-budget.md
@~/.claude/get-shit-done/references/gates.md
</required_reading>
<available_agent_types>
@@ -347,29 +348,40 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT
Run:
```bash
ACTUAL_BASE=$(git merge-base HEAD {EXPECTED_BASE})
CURRENT_HEAD=$(git rev-parse HEAD)
```
If `ACTUAL_BASE` != `{EXPECTED_BASE}` (i.e. the worktree branch was created from an older
-base such as `main` instead of the feature branch HEAD), rebase onto the correct base:
+base such as `main` instead of the feature branch HEAD), hard-reset to the correct base:
```bash
-git rebase --onto {EXPECTED_BASE} $(git rev-parse --abbrev-ref HEAD~1 2>/dev/null || git rev-parse HEAD^) HEAD 2>/dev/null || true
-# If rebase fails or is a no-op, reset the branch to start from the correct base:
-git reset --soft {EXPECTED_BASE}
+# Safe: this runs before any agent work, so no uncommitted changes to lose
+git reset --hard {EXPECTED_BASE}
+# Verify correction succeeded
+if [ "$(git rev-parse HEAD)" != "{EXPECTED_BASE}" ]; then
+echo "ERROR: Could not correct worktree base — aborting to prevent data loss"
+exit 1
+fi
```
+`reset --hard` is safe here because this is a fresh worktree with no user changes. It
+resets both the HEAD pointer AND the working tree to the correct base commit (#2015).
If `ACTUAL_BASE` == `{EXPECTED_BASE}`: the branch base is correct, proceed immediately.
-This check fixes a known issue on Windows where `EnterWorktree` creates branches from
-`main` instead of the current feature branch HEAD.
+This check fixes a known issue where `EnterWorktree` creates branches from
+`main` instead of the current feature branch HEAD (affects all platforms).
</worktree_branch_check>
<parallel_execution>
-You are running as a PARALLEL executor agent. Use --no-verify on all git
-commits to avoid pre-commit hook contention with other agents. The
-orchestrator validates hooks once after all agents complete.
+You are running as a PARALLEL executor agent in a git worktree.
+Use --no-verify on all git commits to avoid pre-commit hook contention
+with other agents. The orchestrator validates hooks once after all agents complete.
For gsd-tools commits: add --no-verify flag.
For direct git commits: use git commit --no-verify -m "..."
IMPORTANT: Do NOT modify STATE.md or ROADMAP.md. execute-plan.md
auto-detects worktree mode (`.git` is a file, not a directory) and skips
shared file updates automatically. The orchestrator updates them centrally
after merge.
</parallel_execution>
<execution_context>
@@ -407,6 +419,7 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT
- [ ] All tasks executed
- [ ] Each task committed individually
- [ ] SUMMARY.md created in plan directory
- [ ] No modifications to shared orchestrator artifacts (the orchestrator handles all post-wave shared-file writes)
</success_criteria>
"
)
@@ -497,6 +510,15 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT
# Snapshot list of files on main BEFORE merge to detect resurrections
PRE_MERGE_FILES=$(git ls-files .planning/)
# Pre-merge deletion check: warn if the worktree branch deletes tracked files
DELETIONS=$(git diff --diff-filter=D --name-only HEAD..."$WT_BRANCH" 2>/dev/null || true)
if [ -n "$DELETIONS" ]; then
echo "BLOCKED: Worktree branch $WT_BRANCH contains file deletions: $DELETIONS"
echo "Review these deletions before merging. If intentional, remove this guard and re-run."
rm -f "$STATE_BACKUP" "$ROADMAP_BACKUP"
continue
fi
# Merge the worktree branch into the current branch
git merge "$WT_BRANCH" --no-edit -m "chore: merge executor worktree ($WT_BRANCH)" 2>&1 || {
echo "⚠ Merge conflict from worktree $WT_BRANCH — resolve manually"
@@ -547,22 +569,113 @@ Execute each selected wave in sequence. Within a wave: parallel if `PARALLELIZAT
**If no worktrees found:** Skip silently — agents may have been spawned without worktree isolation.
-5.6. **Post-wave shared artifact update (worktree mode only):**
+5.6. **Post-merge test gate (parallel mode only):**
-When executor agents ran with `isolation="worktree"`, they skipped STATE.md and ROADMAP.md updates to avoid last-merge-wins overwrites. The orchestrator is the single writer for these files. After worktrees are merged back, update shared artifacts once:
After merging all worktrees in a wave, run the project's test suite to catch
cross-plan integration issues that individual worktree self-checks cannot detect
(e.g., conflicting type definitions, removed exports, import changes).
This addresses the Generator self-evaluation blind spot identified in Anthropic's
harness engineering research: agents reliably report Self-Check: PASSED even when
merging their work creates failures.
```bash
# Update ROADMAP.md for each completed plan in this wave
for PLAN_ID in ${WAVE_PLAN_IDS}; do
node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" roadmap update-plan-progress "${PHASE_NUMBER}" "${PLAN_ID}" completed
done
# Detect test runner and run quick smoke test (timeout: 5 minutes)
TEST_EXIT=0
timeout 300 bash -c '
if [ -f "package.json" ]; then
npm test 2>&1
elif [ -f "Cargo.toml" ]; then
cargo test 2>&1
elif [ -f "go.mod" ]; then
go test ./... 2>&1
elif [ -f "pyproject.toml" ] || [ -f "requirements.txt" ]; then
python -m pytest -x -q --tb=short 2>&1 || uv run python -m pytest -x -q --tb=short 2>&1
else
echo "⚠ No test runner detected — skipping post-merge test gate"
exit 0
fi
'
TEST_EXIT=$?
if [ "${TEST_EXIT}" -eq 0 ]; then
echo "✓ Post-merge test gate passed — no cross-plan conflicts"
elif [ "${TEST_EXIT}" -eq 124 ]; then
echo "⚠ Post-merge test gate timed out after 5 minutes"
else
echo "✗ Post-merge test gate failed (exit code ${TEST_EXIT})"
WAVE_FAILURE_COUNT=$((WAVE_FAILURE_COUNT + 1))
fi
```
**If `TEST_EXIT` is 0 (pass):** `✓ Post-merge test gate: {N} tests passed — no cross-plan conflicts` → continue to orchestrator tracking update.
**If `TEST_EXIT` is 124 (timeout):** Log warning, treat as non-blocking, continue. Tests may need a longer budget or manual run.
**If `TEST_EXIT` is non-zero (test failure):** Increment `WAVE_FAILURE_COUNT` to track
cumulative failures across waves. Subsequent waves should report:
`⚠ Note: ${WAVE_FAILURE_COUNT} prior wave(s) had test failures`
5.7. **Post-wave shared artifact update (worktree mode only, skip if tests failed):**
When executor agents ran with `isolation="worktree"`, they skipped STATE.md and ROADMAP.md updates to avoid last-merge-wins overwrites. The orchestrator is the single writer for these files. After worktrees are merged back, update shared artifacts once.
**Only update tracking when tests passed (TEST_EXIT=0).**
If tests failed or timed out, skip the tracking update — plans should
not be marked as complete when integration tests are failing or inconclusive.
```bash
# Guard: only update tracking if post-merge tests passed
# Timeout (124) is treated as inconclusive — do NOT mark plans complete
if [ "${TEST_EXIT}" -eq 0 ]; then
# Update ROADMAP plan progress for each completed plan in this wave
for plan_id in {completed_plan_ids}; do
node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" roadmap update-plan-progress "${PHASE_NUMBER}" "${plan_id}" "complete"
done
# Only commit tracking files if they actually changed
if ! git diff --quiet .planning/ROADMAP.md .planning/STATE.md 2>/dev/null; then
node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" commit "docs(phase-${PHASE_NUMBER}): update tracking after wave ${N}" --files .planning/ROADMAP.md .planning/STATE.md
fi
elif [ "${TEST_EXIT}" -eq 124 ]; then
echo "⚠ Skipping tracking update — test suite timed out. Plans remain in-progress. Run tests manually to confirm."
else
echo "⚠ Skipping tracking update — post-merge tests failed (exit ${TEST_EXIT}). Plans remain in-progress until tests pass."
fi
```
Where `{completed_plan_ids}` is the space-separated list of plan IDs that completed in this wave.
**If `workflow.use_worktrees` is `false`:** Sequential agents already updated STATE.md and ROADMAP.md themselves — skip this step.
5.8. **Handle test gate failures (when `WAVE_FAILURE_COUNT > 0`):**
```
## ⚠ Post-Merge Test Failure (cumulative failures: ${WAVE_FAILURE_COUNT})
Wave {N} worktrees merged successfully, but {M} tests fail after merge.
This typically indicates conflicting changes across parallel plans
(e.g., type definitions, shared imports, API contracts).
Failed tests:
{first 10 lines of failure output}
Options:
1. Fix now (recommended) — resolve conflicts before next wave
2. Continue — failures may compound in subsequent waves
```
Note: If `WAVE_FAILURE_COUNT > 1`, strongly recommend "Fix now" — compounding
failures across multiple waves become exponentially harder to diagnose.
If "Fix now": diagnose failures (typically import conflicts, missing types,
or changed function signatures from parallel plans modifying the same module).
Fix, commit as `fix: resolve post-merge conflicts from wave {N}`, re-run tests.
**Why this matters:** Worktree isolation means each agent's Self-Check passes
in isolation. But when merged, add/add conflicts in shared files (models, registries,
CLI entry points) can silently drop code. The post-merge gate catches this before
the next wave builds on a broken foundation.
6. **Report completion — spot-check claims first:**
For each SUMMARY.md:
@@ -848,10 +961,11 @@ Collect all unique test file paths into `REGRESSION_FILES`.
```bash
# Detect test runner and run prior phase tests
if [ -f "package.json" ]; then
# Node.js — use project's test runner
-npx jest ${REGRESSION_FILES} --passWithNoTests --no-coverage -q 2>&1 || npx vitest run ${REGRESSION_FILES} 2>&1
+npm test 2>&1
elif [ -f "Cargo.toml" ]; then
cargo test 2>&1
elif [ -f "go.mod" ]; then
go test ./... 2>&1
elif [ -f "requirements.txt" ] || [ -f "pyproject.toml" ]; then
python -m pytest ${REGRESSION_FILES} -q --tb=short 2>&1
fi
@@ -1213,13 +1327,32 @@ Read and follow `~/.claude/get-shit-done/workflows/transition.md`, passing throu
**IMPORTANT: There is NO `/gsd-transition` command. Never suggest it. The transition workflow is internal only.**
Check whether CONTEXT.md already exists for the next phase:
```bash
ls .planning/phases/*{next}*/{next}-CONTEXT.md 2>/dev/null || echo "no-context"
```
If CONTEXT.md does **not** exist for the next phase, present:
```
## ✓ Phase {X}: {Name} Complete
/gsd-progress ${GSD_WS} — see updated roadmap
/gsd-discuss-phase {next} ${GSD_WS} — start here: discuss next phase before planning ← recommended
/gsd-plan-phase {next} ${GSD_WS} — plan next phase (skip discuss)
/gsd-execute-phase {next} ${GSD_WS} — execute next phase (skip discuss and plan)
```
If CONTEXT.md **exists** for the next phase, present:
```
## ✓ Phase {X}: {Name} Complete
/gsd-progress ${GSD_WS} — see updated roadmap
/gsd-plan-phase {next} ${GSD_WS} — start here: plan next phase (CONTEXT.md already present) ← recommended
/gsd-discuss-phase {next} ${GSD_WS} — re-discuss next phase
/gsd-execute-phase {next} ${GSD_WS} — execute next phase (skip planning)
```
Only suggest the commands listed above. Do not invent or hallucinate command names.


@@ -72,7 +72,7 @@ grep -n "type=\"checkpoint" .planning/phases/XX-name/{phase}-{plan}-PLAN.md
| Verify-only | B (segmented) | Segments between checkpoints. After none/human-verify → SUBAGENT. After decision/human-action → MAIN |
| Decision | C (main) | Execute entirely in main context |
**Pattern A:** init_agent_tracking → capture `EXPECTED_BASE=$(git rev-parse HEAD)` → spawn Task(subagent_type="gsd-executor", model=executor_model) with prompt: execute plan at [path], autonomous, all tasks + SUMMARY + commit, follow deviation/auth rules, report: plan name, tasks, SUMMARY path, commit hash → track agent_id → wait → update tracking → report. **Include `isolation="worktree"` only if `workflow.use_worktrees` is not `false`** (read via `config-get workflow.use_worktrees`). **When using `isolation="worktree"`, include a `<worktree_branch_check>` block in the prompt** instructing the executor to run `git merge-base HEAD {EXPECTED_BASE}` and, if the result differs from `{EXPECTED_BASE}`, hard-reset the branch with `git reset --hard {EXPECTED_BASE}` before starting work (safe — runs before any agent work), then verify with `[ "$(git rev-parse HEAD)" != "{EXPECTED_BASE}" ] && exit 1`. This corrects a known issue where `EnterWorktree` creates branches from `main` instead of the feature branch HEAD (affects all platforms).
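The branch-base correction in Pattern A can be sketched as a small helper. This is a sketch only: the function name is illustrative, and the expected base commit is supplied by the orchestrator.

```shell
# Hypothetical helper mirroring the worktree branch check described above.
# If the worktree branch was not created from the expected base commit,
# hard-reset to it (safe: runs before any agent work exists), then verify.
check_branch_base() {
  expected="$1"
  if [ "$(git merge-base HEAD "$expected")" != "$expected" ]; then
    git reset --hard "$expected"
    if [ "$(git rev-parse HEAD)" != "$expected" ]; then
      echo "ERROR: Could not correct worktree base" >&2
      return 1
    fi
  fi
}
```

A worktree correctly based on the expected commit passes through untouched; a branch cut from the wrong ref is rebased onto the expected commit before any work begins.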
**Pattern B:** Execute segment-by-segment. Autonomous segments: spawn subagent for assigned tasks only (no SUMMARY/commit). Checkpoints: main context. After all segments: aggregate, create SUMMARY, commit. See segment_execution.
@@ -110,6 +110,8 @@ Pattern B only (verify-only checkpoints). Skip for A/C.
3. After ALL segments: aggregate files/deviations/decisions → create SUMMARY.md → commit → self-check:
- Verify key-files.created exist on disk with `[ -f ]`
- Check `git log --oneline --all --grep="{phase}-{plan}"` returns ≥1 commit
- Re-run ALL `<acceptance_criteria>` from every task — if any fail, fix before finalizing SUMMARY
- Re-run the plan-level `<verification>` commands — log results in SUMMARY
- Append `## Self-Check: PASSED` or `## Self-Check: FAILED` to SUMMARY
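The file-existence part of the self-check above can be sketched as a tiny function. This is illustrative only; a real run also greps the git log and re-runs acceptance criteria as listed.

```shell
# Hypothetical sketch of the SUMMARY self-check: verify that each expected
# file exists on disk, then append a PASSED/FAILED marker to the SUMMARY.
selfcheck() {
  summary="$1"; shift
  for f in "$@"; do
    if [ ! -f "$f" ]; then
      echo "## Self-Check: FAILED" >> "$summary"
      return 1
    fi
  done
  echo "## Self-Check: PASSED" >> "$summary"
}
```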
**Known Claude Code bug (classifyHandoffIfNeeded):** If any segment agent reports "failed" with `classifyHandoffIfNeeded is not defined`, this is a Claude Code runtime bug — not a real failure. Run spot-checks; if they pass, treat as successful.
@@ -133,6 +135,8 @@ This IS the execution instructions. Follow exactly. If plan references CONTEXT.m
node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" phases list --type summaries --raw
# Extract the second-to-last summary from the JSON result
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
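The TEXT_MODE decision can be expressed as a small predicate. This is a sketch under stated assumptions: the raw flag string and the init JSON shape are taken from the description above, and the function name is illustrative.

```shell
# Hypothetical check: text mode is on if --text appears in the raw arguments
# OR the init JSON reports "text_mode": true.
text_mode_enabled() {
  args="$1"; init_json="$2"
  case " $args " in *" --text "*) return 0 ;; esac
  echo "$init_json" | grep -q '"text_mode"[[:space:]]*:[[:space:]]*true'
}
```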
If previous SUMMARY has unresolved "Issues Encountered" or "Next Phase Readiness" blockers: AskUserQuestion(header="Previous Issues", options: "Proceed anyway" | "Address first" | "Review previous").
</step>
@@ -145,7 +149,13 @@ Deviations are normal — handle via rules below.
- **MANDATORY read_first gate:** If the task has a `<read_first>` field, you MUST read every listed file BEFORE making any edits. This is not optional. Do not skip files because you "already know" what's in them — read them. The read_first files establish ground truth for the task.
- `type="auto"`: if `tdd="true"` → TDD execution. Implement with deviation rules + auth gates. Verify done criteria. Commit (see task_commit). Track hash for Summary.
- `type="checkpoint:*"`: STOP → checkpoint_protocol → wait for user → continue only after confirmation.
- **MANDATORY acceptance_criteria check:** After completing each task, if it has `<acceptance_criteria>`, verify EVERY criterion before moving to the next task. Use grep, file reads, or CLI commands to confirm each criterion. If any criterion fails, fix the implementation before proceeding. Do not skip criteria or mark them as "will verify later".
- **HARD GATE — acceptance_criteria verification:** After completing each task, if it has `<acceptance_criteria>`, you MUST run a verification loop before proceeding:
1. For each criterion: execute the grep, file check, or CLI command that proves it passes
2. Log each result as PASS or FAIL with the command output
3. If ANY criterion fails: fix the implementation immediately, then re-run ALL criteria
4. Repeat until all criteria pass — you are BLOCKED from starting the next task until this gate clears
5. If a criterion cannot be satisfied after 2 fix attempts, log it as a deviation with reason — do NOT silently skip it
This is not advisory. A task with failing acceptance criteria is an incomplete task.
3. Run `<verification>` checks
4. Confirm `<success_criteria>` met
5. Document deviations in Summary
@@ -396,19 +406,29 @@ Next: more plans → "Ready for {next-plan}" | last → "Phase complete, ready f
</step>
<step name="update_current_position">
**Skip this step if running in parallel mode** (the orchestrator in execute-phase.md
handles STATE.md/ROADMAP.md updates centrally after merging worktrees to avoid
merge conflicts).
Update STATE.md using gsd-tools:
```bash
# Auto-detect parallel mode: .git is a file in worktrees, a directory in main repo
IS_WORKTREE=$([ -f .git ] && echo "true" || echo "false")
# Skip in parallel mode — orchestrator handles STATE.md centrally
if [ "$IS_WORKTREE" != "true" ]; then
  # Advance plan counter (handles last-plan edge case)
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" state advance-plan
  # Record execution metrics
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" state record-metric \
    --phase "${PHASE}" --plan "${PLAN}" --duration "${DURATION}" \
    --tasks "${TASK_COUNT}" --files "${FILE_COUNT}"
  # Recalculate progress bar from disk state
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" state update-progress
fi
```
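The `.git` probe used above relies on a git layout detail: in a linked worktree, `.git` is a one-line pointer file back to the main repository, while in the main checkout it is a directory. A minimal sketch (the helper name is illustrative):

```shell
# Returns "true" when the given checkout directory is a linked worktree.
detect_worktree() {
  if [ -f "$1/.git" ]; then
    echo "true"   # .git is a pointer file, e.g. "gitdir: /repo/.git/worktrees/wt1"
  else
    echo "false"  # .git is the real metadata directory (main checkout)
  fi
}
```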
</step>
@@ -443,8 +463,17 @@ If SUMMARY "Issues Encountered" ≠ "None": yolo → log and continue. Interacti
</step>
<step name="update_roadmap">
**Skip this step if running in parallel mode** (the orchestrator handles ROADMAP.md
updates centrally after merging worktrees).
```bash
# Auto-detect parallel mode: .git is a file in worktrees, a directory in main repo
IS_WORKTREE=$([ -f .git ] && echo "true" || echo "false")
# Skip in parallel mode — orchestrator handles ROADMAP.md centrally
if [ "$IS_WORKTREE" != "true" ]; then
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" roadmap update-plan-progress "${PHASE}"
fi
```
Counts PLAN vs SUMMARY files on disk. Updates progress table row with correct count and status (`In Progress` or `Complete` with date).
</step>
@@ -463,7 +492,15 @@ Extract requirement IDs from the plan's frontmatter (e.g., `requirements: [AUTH-
Task code already committed per-task. Commit plan metadata:
```bash
# Auto-detect parallel mode: .git is a file in worktrees, a directory in main repo
IS_WORKTREE=$([ -f .git ] && echo "true" || echo "false")
# In parallel mode: exclude STATE.md and ROADMAP.md (orchestrator commits these)
if [ "$IS_WORKTREE" = "true" ]; then
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" commit "docs({phase}-{plan}): complete [plan-name] plan" --files .planning/phases/XX-name/{phase}-{plan}-SUMMARY.md .planning/REQUIREMENTS.md
else
  node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" commit "docs({phase}-{plan}): complete [plan-name] plan" --files .planning/phases/XX-name/{phase}-{plan}-SUMMARY.md .planning/STATE.md .planning/ROADMAP.md .planning/REQUIREMENTS.md
fi
```
</step>
@@ -507,8 +544,8 @@ All routes: `/clear` first for fresh context.
- All verifications pass
- USER-SETUP.md generated if user_setup in frontmatter
- SUMMARY.md created with substantive content
- STATE.md updated (position, decisions, issues, session) — unless parallel mode (orchestrator handles)
- ROADMAP.md updated — unless parallel mode (orchestrator handles)
- If codebase map exists: map updated with execution changes (or skipped if no significant changes)
- If USER-SETUP.md created: prominently surfaced in completion output
</success_criteria>


@@ -164,6 +164,8 @@ Exit WITHOUT writing any files. This is the safety gate — no PLAN.md is writte
**If only WARNINGS and/or INFO (no blockers):**
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Ask via AskUserQuestion using the approve-revise-abort pattern:
- question: "Review the warnings above. Proceed with import?"
- header: "Approve?"


@@ -326,6 +326,9 @@ gh pr close {number} --comment "Closed by GSD inbox triage: this PR does not mee
```
Always confirm with the user before closing anything:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
```
AskUserQuestion:
question: "Found {N} items to close. Review the list above — proceed with closing?"
```


@@ -117,6 +117,8 @@ All {phase_count} phases done. Ready for final steps:
→ /gsd-complete-milestone — archive and wrap up
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Ask user via AskUserQuestion:
- **question:** "All phases complete. What next?"
- **options:** "Verify work" / "Complete milestone" / "Exit manager"


@@ -40,6 +40,8 @@ If the flag is absent, keep the current behavior of continuing phase numbering f
**If no context file:**
- Present what shipped in last milestone
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
- Ask inline (freeform, NOT AskUserQuestion): "What do you want to build next?"
- Wait for their response, then use AskUserQuestion to probe specifics
- If user selects "Other" at any point to provide freeform input, ask follow-up as plain text — not another AskUserQuestion


@@ -103,6 +103,8 @@ git init
**If `needs_codebase_map` is true** (from init — existing code detected but no codebase map):
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Codebase"


@@ -31,6 +31,8 @@ Extract from $ARGUMENTS:
**If `--name` is missing and not `--auto`:**
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Workspace Name"
- question: "What should this workspace be called?"


@@ -1,5 +1,7 @@
<purpose>
Zero-friction idea capture. One Write call, one confirmation line. No questions, no prompts.
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Runs inline — no Task, no AskUserQuestion, no Bash.
</purpose>


@@ -9,6 +9,7 @@ Read all files referenced by the invoking prompt's execution_context before star
@~/.claude/get-shit-done/references/revision-loop.md
@~/.claude/get-shit-done/references/gate-prompts.md
@~/.claude/get-shit-done/references/agent-contracts.md
@~/.claude/get-shit-done/references/gates.md
</required_reading>
<available_agent_types>
@@ -45,7 +46,7 @@ Parse JSON for: `researcher_model`, `planner_model`, `checker_model`, `research_
## 2. Parse and Normalize Arguments
Extract from $ARGUMENTS: phase number (integer or decimal like `2.1`), flags (`--research`, `--skip-research`, `--gaps`, `--skip-verify`, `--skip-ui`, `--prd <filepath>`, `--reviews`, `--text`).
Set `TEXT_MODE=true` if `--text` is present in $ARGUMENTS OR `text_mode` from init JSON is `true`. When `TEXT_MODE` is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for Claude Code remote sessions (`/rc` mode) where TUI menus don't work through the Claude App.
@@ -240,6 +241,46 @@ If "Run discuss-phase first":
```
**Exit the plan-phase workflow. Do not continue.**
## 4.5. Check AI-SPEC
**Skip if:** `workflow.ai_integration_phase` in config is false, or the `--skip-ai-spec` flag is provided.
```bash
AI_SPEC_FILE=$(ls "${PHASE_DIR}"/*-AI-SPEC.md 2>/dev/null | head -1)
AI_PHASE_CFG=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get workflow.ai_integration_phase 2>/dev/null || echo "true")
```
**Skip if `AI_PHASE_CFG` is `false`.**
**If `AI_SPEC_FILE` is empty:** Check phase goal for AI keywords:
```bash
echo "${phase_goal}" | grep -qi "agent\|llm\|rag\|chatbot\|embedding\|langchain\|llamaindex\|crewai\|langgraph\|openai\|anthropic\|vector\|eval\|ai system"
```
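As a sketch, the keyword probe can be wrapped in a predicate. The extended-regex form is used here for portability; the keyword list mirrors the one above, and the function name is illustrative.

```shell
# Hypothetical wrapper: does a phase goal mention AI-related keywords?
phase_mentions_ai() {
  echo "$1" | grep -qiE "agent|llm|rag|chatbot|embedding|langchain|llamaindex|crewai|langgraph|openai|anthropic|vector|eval|ai system"
}
```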
**If AI keywords detected AND no AI-SPEC.md:**
```
◆ Note: This phase appears to involve AI system development.
Consider running /gsd-ai-integration-phase {N} before planning to:
- Select the right framework for your use case
- Research its docs and best practices
- Design an evaluation strategy
Continue planning without AI-SPEC? (non-blocking — /gsd-ai-integration-phase can be run after)
```
Use AskUserQuestion with options:
- "Continue — plan without AI-SPEC"
- "Stop — I'll run /gsd-ai-integration-phase {N} first"
If "Stop": Exit with `/gsd-ai-integration-phase {N}` reminder.
If "Continue": Proceed. (Non-blocking — planner will note AI-SPEC is absent.)
**If `AI_SPEC_FILE` is non-empty:** Extract framework for planner context:
```bash
FRAMEWORK_LINE=$(grep "Selected Framework:" "${AI_SPEC_FILE}" | head -1)
```
Pass `ai_spec_path` and `framework_line` to planner in step 7 so it can reference the AI design contract.
## 5. Handle Research
**Skip if:** `--gaps` flag or `--skip-research` flag or `--reviews` flag.
@@ -422,6 +463,8 @@ UI_SPEC_FILE=$(ls "${PHASE_DIR}"/*-UI-SPEC.md 2>/dev/null | head -1)
**If UI-SPEC.md found:** Set `UI_SPEC_PATH=$UI_SPEC_FILE`. Display: `Using UI design contract: ${UI_SPEC_PATH}`
**If UI-SPEC.md missing AND `--skip-ui` flag is present in $ARGUMENTS:** Skip silently to step 6.
**If UI-SPEC.md missing AND `UI_GATE_CFG` is `true`:**
Read auto-chain state:
@@ -444,24 +487,18 @@ Continue to step 6.
**If `AUTO_CHAIN` is `false` (manual invocation):**
Output this markdown directly (not as a code block):
```
## ⚠ UI-SPEC.md missing for Phase {N}
▶ Recommended next step:
`/gsd-ui-phase {N} ${GSD_WS}` — generate UI design contract before planning
───────────────────────────────────────────────
Also available:
- `/gsd-plan-phase {N} --skip-ui ${GSD_WS}` — plan without UI-SPEC (not recommended for frontend phases)
```
**Exit the plan-phase workflow. Do not continue.**
**If `HAS_UI` is 1 (no frontend indicators):** Skip silently to step 5.7.


@@ -32,6 +32,9 @@ mkdir -p .planning/seeds
<step name="gather_context">
Ask focused questions to build a complete seed:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
```
AskUserQuestion(
header: "Trigger",
```


@@ -1,7 +1,8 @@
<purpose>
Create a clean branch for pull requests by filtering out transient .planning/ commits.
The PR branch contains only code changes and structural planning state — reviewers
don't see GSD transient artifacts (PLAN.md, SUMMARY.md, CONTEXT.md, RESEARCH.md, etc.)
but milestone archives, STATE.md, ROADMAP.md, and PROJECT.md changes are preserved.
Uses git cherry-pick with path filtering to rebuild a clean history.
</purpose>
@@ -48,24 +49,47 @@ Classify commits:
git log --oneline "$TARGET".."$CURRENT_BRANCH" --no-merges
```
For each commit, check if it ONLY touches .planning/ files:
**Structural planning files** — always preserved (repository planning state):
- `.planning/STATE.md`
- `.planning/ROADMAP.md`
- `.planning/MILESTONES.md`
- `.planning/PROJECT.md`
- `.planning/REQUIREMENTS.md`
- `.planning/milestones/**`
**Transient planning files** — excluded from PR branch (reviewer noise):
- `.planning/phases/**` (PLAN.md, SUMMARY.md, CONTEXT.md, RESEARCH.md, etc.)
- `.planning/quick/**`
- `.planning/research/**`
- `.planning/threads/**`
- `.planning/todos/**`
- `.planning/debug/**`
- `.planning/seeds/**`
- `.planning/codebase/**`
- `.planning/ui-reviews/**`
For each commit, check what it touches:
```bash
# For each commit hash
FILES=$(git diff-tree --no-commit-id --name-only -r $HASH)
NON_PLANNING=$(echo "$FILES" | grep -v "^\.planning/" | wc -l)
STRUCTURAL=$(echo "$FILES" | grep -E "^\.planning/(STATE|ROADMAP|MILESTONES|PROJECT|REQUIREMENTS)\.md|^\.planning/milestones/" | wc -l)
TRANSIENT_ONLY=$(echo "$FILES" | grep "^\.planning/" | grep -vE "^\.planning/(STATE|ROADMAP|MILESTONES|PROJECT|REQUIREMENTS)\.md|^\.planning/milestones/" | wc -l)
```
Classify:
- **Code commits**: Touch at least one non-.planning/ file → INCLUDE
- **Structural planning commits**: Touch only structural .planning/ files (STATE.md, ROADMAP.md, MILESTONES.md, PROJECT.md, REQUIREMENTS.md, milestones/**) → INCLUDE
- **Transient planning commits**: Touch only transient .planning/ files (phases/, quick/, research/, etc.) → EXCLUDE
- **Mixed commits**: Touch code + any planning files → INCLUDE (transient planning changes come along; acceptable in mixed context)
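Putting the rules together, a commit's file list can be classified with a small function. This is a sketch: the function name is hypothetical, and the structural pattern mirrors the list above.

```shell
# Hypothetical classifier: reads a commit's file list on stdin,
# prints INCLUDE or EXCLUDE per the rules above.
classify_files() {
  files=$(cat)
  structural='^\.planning/(STATE|ROADMAP|MILESTONES|PROJECT|REQUIREMENTS)\.md$|^\.planning/milestones/'
  non_planning=$(printf '%s\n' "$files" | grep -cv '^\.planning/' || true)
  transient=$(printf '%s\n' "$files" | grep '^\.planning/' | grep -cvE "$structural" || true)
  if [ "$non_planning" -gt 0 ] || [ "$transient" -eq 0 ]; then
    echo "INCLUDE"   # code, mixed, or structural-planning-only commit
  else
    echo "EXCLUDE"   # transient-planning-only commit
  fi
}
```

For example, a commit touching only `.planning/STATE.md` is classified INCLUDE, while one touching only `.planning/phases/` files is classified EXCLUDE.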
Display analysis:
```
Commits to include: {N} (code changes + structural planning)
Commits to exclude: {N} (transient planning-only)
Mixed commits: {N} (code + planning — included)
Structural planning commits: {N} (STATE/ROADMAP/milestone updates — included)
```
</step>
@@ -77,13 +101,17 @@ PR_BRANCH="${CURRENT_BRANCH}-pr"
git checkout -b "$PR_BRANCH" "$TARGET"
```
Cherry-pick code commits and structural planning commits (in order):
```bash
for HASH in $CODE_AND_STRUCTURAL_COMMITS; do
git cherry-pick "$HASH" --no-commit
  # Remove only transient .planning/ subdirectories that came along in mixed commits.
  # DO NOT remove structural files (STATE.md, ROADMAP.md, MILESTONES.md, PROJECT.md,
  # REQUIREMENTS.md, milestones/) — these must survive into the PR branch.
  for dir in phases quick research threads todos debug seeds codebase ui-reviews; do
    git rm -r --cached ".planning/$dir/" 2>/dev/null || true
  done
git commit -C "$HASH"
done
```


@@ -30,6 +30,8 @@ PROFILE_PATH="$HOME/.claude/get-shit-done/USER-PROFILE.md"
**If profile exists AND --refresh NOT set AND --questionnaire NOT set:**
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Existing Profile"
- question: "You already have a profile. What would you like to do?"
@@ -46,7 +48,7 @@ If "Cancel": Display "No changes made." and exit.
Backup existing profile:
```bash
cp "$HOME/.claude/get-shit-done/USER-PROFILE.md" "$HOME/.claude/USER-PROFILE.backup.md"
```
Display: "Re-analyzing your sessions to update your profile."
@@ -381,7 +383,7 @@ Read both old backup and new analysis to compare dimension ratings/confidence.
Read the backed-up profile:
```bash
BACKUP_PATH="$HOME/.claude/USER-PROFILE.backup.md"
```
Compare each dimension's rating and confidence between old and new. Display diff table showing only changed dimensions:


@@ -38,6 +38,9 @@ Parse `$ARGUMENTS` for:
If `$DESCRIPTION` is empty after parsing, prompt user interactively:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
```
AskUserQuestion(
header: "Quick Task",
```
@@ -567,8 +570,10 @@ ${USE_WORKTREES !== "false" ? `
<worktree_branch_check>
FIRST ACTION before any other work: verify this worktree branch is based on the correct commit.
Run: git merge-base HEAD ${EXPECTED_BASE}
If the result differs from ${EXPECTED_BASE}, hard-reset to the correct base (safe — runs before any agent work):
git reset --hard ${EXPECTED_BASE}
Then verify: if [ "$(git rev-parse HEAD)" != "${EXPECTED_BASE}" ]; then echo "ERROR: Could not correct worktree base"; exit 1; fi
This corrects a known issue where EnterWorktree creates branches from main instead of the feature branch HEAD (affects all platforms).
</worktree_branch_check>
` : ''}


@@ -23,6 +23,8 @@ Parse JSON for: `workspace_name`, `workspace_path`, `has_manifest`, `strategy`,
First run `/gsd-list-workspaces` to show available workspaces, then ask:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- header: "Remove Workspace"
- question: "Which workspace do you want to remove?"


@@ -139,26 +139,47 @@ Write to a temp file: `/tmp/gsd-review-prompt-{phase}.md`
</step>
<step name="invoke_reviewers">
Read model preferences from planning config. Null/missing values fall back to CLI defaults.
```bash
GEMINI_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get review.models.gemini --raw 2>/dev/null || true)
CLAUDE_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get review.models.claude --raw 2>/dev/null || true)
CODEX_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get review.models.codex --raw 2>/dev/null || true)
OPENCODE_MODEL=$(node "$HOME/.claude/get-shit-done/bin/gsd-tools.cjs" config-get review.models.opencode --raw 2>/dev/null || true)
```
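Each CLI invocation below repeats the same null-check on its configured model. As a sketch, that check could be factored into one helper; the function name is illustrative, and the literal `"null"` comparison mirrors the checks the workflow already performs on `config-get --raw` output.

```shell
# Hypothetical helper: emit a model flag only when a model is configured.
# An unset key is assumed to yield empty output or the string "null",
# matching the [ -n ... ] && [ ... != "null" ] checks used below.
model_args() {
  flag="$1"; model="$2"
  if [ -n "$model" ] && [ "$model" != "null" ]; then
    printf '%s %s' "$flag" "$model"
  fi
}
```

Usage would look like `gemini $(model_args -m "$GEMINI_MODEL") -p "..."`; note the unquoted expansion relies on simple word splitting, which is acceptable for this sketch but not for model names containing spaces.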
For each selected CLI, invoke in sequence (not parallel — avoid rate limits):
**Gemini:**
```bash
if [ -n "$GEMINI_MODEL" ] && [ "$GEMINI_MODEL" != "null" ]; then
  gemini -m "$GEMINI_MODEL" -p "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-gemini-{phase}.md
else
  gemini -p "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-gemini-{phase}.md
fi
```
**Claude (separate session):**
```bash
if [ -n "$CLAUDE_MODEL" ] && [ "$CLAUDE_MODEL" != "null" ]; then
  claude --model "$CLAUDE_MODEL" -p "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-claude-{phase}.md
else
  claude -p "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-claude-{phase}.md
fi
```
**Codex:**
```bash
if [ -n "$CODEX_MODEL" ] && [ "$CODEX_MODEL" != "null" ]; then
  codex exec --model "$CODEX_MODEL" --skip-git-repo-check "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-codex-{phase}.md
else
  codex exec --skip-git-repo-check "$(cat /tmp/gsd-review-prompt-{phase}.md)" 2>/dev/null > /tmp/gsd-review-codex-{phase}.md
fi
```
**CodeRabbit:**
Note: CodeRabbit reviews the current git diff/working tree — it does not accept a prompt or model flag. It may take up to 5 minutes. Use `timeout: 360000` on the Bash tool call.
```bash
coderabbit review --prompt-only 2>/dev/null > /tmp/gsd-review-coderabbit-{phase}.md
```
@@ -166,7 +187,11 @@ coderabbit review --prompt-only 2>/dev/null > /tmp/gsd-review-coderabbit-{phase}
**OpenCode (via GitHub Copilot):**
```bash
if [ -n "$OPENCODE_MODEL" ] && [ "$OPENCODE_MODEL" != "null" ]; then
  cat /tmp/gsd-review-prompt-{phase}.md | opencode run --model "$OPENCODE_MODEL" - 2>/dev/null > /tmp/gsd-review-opencode-{phase}.md
else
  cat /tmp/gsd-review-prompt-{phase}.md | opencode run - 2>/dev/null > /tmp/gsd-review-opencode-{phase}.md
fi
if [ ! -s /tmp/gsd-review-opencode-{phase}.md ]; then
echo "OpenCode review failed or returned empty output." > /tmp/gsd-review-opencode-{phase}.md
fi


@@ -73,6 +73,8 @@ If `threats_open: 0` → skip to Step 6 directly.
## 4. Present Threat Plan
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Call AskUserQuestion with threat table and options:
1. "Verify all open threats" → Step 5
2. "Accept all open — document in accepted risks log" → add to SECURITY.md accepted risks, set all CLOSED, Step 6
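The TEXT_MODE fallback described above amounts to rendering the same options as a numbered list. As a minimal sketch (the `renderTextMode` name and option shape are hypothetical, not part of GSD's API):

```javascript
// Sketch of the TEXT_MODE fallback: render the options that would go to
// AskUserQuestion as a plain-text numbered list any runtime can display.
// renderTextMode and the option shape are illustrative assumptions.
function renderTextMode(question, options) {
  const lines = [question, ''];
  options.forEach((opt, i) => {
    lines.push(`${i + 1}. ${opt.label} - ${opt.description}`);
  });
  lines.push('', 'Type the number of your choice.');
  return lines.join('\n');
}
```

The orchestrator then waits for a plain-text reply and maps the typed number back to the chosen option.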


@@ -32,12 +32,15 @@ Parse current values (default to `true` if not present):
- `workflow.nyquist_validation` — validation architecture research during plan-phase (default: true if absent)
- `workflow.ui_phase` — generate UI-SPEC.md design contracts for frontend phases (default: true if absent)
- `workflow.ui_safety_gate` — prompt to run /gsd-ui-phase before planning frontend phases (default: true if absent)
- `workflow.ai_integration_phase` — framework selection + eval strategy for AI phases (default: true if absent)
- `model_profile` — which model each agent uses (default: `balanced`)
- `git.branching_strategy` — branching approach (default: `"none"`)
- `workflow.use_worktrees` — whether parallel executor agents run in worktree isolation (default: `true`)
</step>
<step name="present_settings">
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion with current values pre-selected:
```
@@ -118,6 +121,15 @@ AskUserQuestion([
{ label: "No", description: "No prompt — plan-phase proceeds without UI-SPEC check." }
]
},
{
question: "Enable AI Phase? (framework selection + eval strategy for AI phases)",
header: "AI Phase",
multiSelect: false,
options: [
{ label: "Yes (Recommended)", description: "Run /gsd-ai-phase before planning AI system phases. Surfaces the right framework, researches its docs, and designs the evaluation strategy." },
{ label: "No", description: "Skip AI design contract. Good for non-AI phases or when framework is already decided." }
]
},
{
question: "Git branching strategy?",
header: "Branching",
@@ -161,7 +173,7 @@ AskUserQuestion([
multiSelect: false,
options: [
{ label: "Yes (Recommended)", description: "Each parallel executor runs in its own worktree branch — no conflicts between agents." },
{ label: "No", description: "Disable worktree isolation. Agents run sequentially on the main working tree. Use if EnterWorktree creates branches from wrong base (known cross-platform issue)." }
]
}
])
```
@@ -183,6 +195,7 @@ Merge new settings into existing config.json:
"nyquist_validation": true/false,
"ui_phase": true/false,
"ui_safety_gate": true/false,
"ai_integration_phase": true/false,
"text_mode": true/false,
"research_before_questions": true/false,
"discuss_mode": "discuss" | "assumptions",
@@ -244,6 +257,7 @@ Write `~/.gsd/defaults.json` with:
"nyquist_validation": <current>,
"ui_phase": <current>,
"ui_safety_gate": <current>,
"ai_integration_phase": <current>,
"skip_discuss": <current>
}
}
@@ -268,6 +282,7 @@ Display:
| Nyquist Validation | {On/Off} |
| UI Phase | {On/Off} |
| UI Safety Gate | {On/Off} |
| AI Integration Phase | {On/Off} |
| Git Branching | {None/Per Phase/Per Milestone} |
| Skip Discuss | {On/Off} |
| Context Warnings | {On/Off} |
@@ -287,7 +302,7 @@ Quick commands:
<success_criteria>
- [ ] Current config read
- [ ] User presented with 14 settings (profile + 11 workflow toggles + git branching + ctx warnings)
- [ ] Config updated with model_profile, workflow, and git sections
- [ ] User offered to save as global defaults (~/.gsd/defaults.json)
- [ ] Changes confirmed to user


@@ -161,6 +161,9 @@ Report: "PR #{number} created: {url}"
<step name="optional_review">
Ask if user wants to trigger a code review:
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
```
AskUserQuestion:
question: "PR created. Run a code review before merge?"
```


@@ -83,6 +83,8 @@ Continue (non-blocking).
UI_SPEC_FILE=$(ls "${PHASE_DIR}"/*-UI-SPEC.md 2>/dev/null | head -1)
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
**If exists:** Use AskUserQuestion:
- header: "Existing UI-SPEC"
- question: "UI-SPEC.md already exists for Phase {N}. What would you like to do?"
@@ -261,11 +263,17 @@ Dimensions: 6/6 passed
## ▶ Next Up
{If CONTEXT.md exists for this phase:}
**Plan Phase {N}** — planner will use UI-SPEC.md as design context
`/clear` then: `/gsd-plan-phase {N}`
{If CONTEXT.md does NOT exist:}
**Discuss Phase {N}** — gather implementation context before planning
`/clear` then: `/gsd-discuss-phase {N}`
(or `/gsd-plan-phase {N}` to skip discussion)
───────────────────────────────────────────────────────────────
```


@@ -44,6 +44,8 @@ UI_REVIEW_FILE=$(ls "${PHASE_DIR}"/*-UI-REVIEW.md 2>/dev/null | head -1)
**If `SUMMARY_FILES` empty:** Exit — "Phase {N} not executed. Run /gsd-execute-phase {N} first."
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
**If `UI_REVIEW_FILE` non-empty:** Use AskUserQuestion:
- header: "Existing UI Review"
- question: "UI-REVIEW.md already exists for Phase {N}."


@@ -63,6 +63,8 @@ Recent GSD commits:
3. ghi9012 fix(02-03): correct validation logic
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion to ask:
- question: "Which commits to revert? Enter numbers (e.g., 1,3) or 'all'"
- header: "Select"


@@ -341,6 +341,8 @@ Your custom files in other locations are preserved:
If you've modified any GSD files directly, they'll be automatically backed up to `gsd-local-patches/` and can be reapplied with `/gsd-reapply-patches` after the update.
```
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Use AskUserQuestion:
- Question: "Proceed with update?"
- Options:


@@ -83,6 +83,8 @@ No gaps → skip to Step 6, set `nyquist_compliant: true`.
## 4. Present Gap Plan
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Call AskUserQuestion with gap table and options:
1. "Fix all gaps" → Step 5
2. "Skip — mark manual-only" → add to Manual-Only, Step 6


@@ -183,6 +183,89 @@ grep -E "Phase ${PHASE_NUM}" .planning/REQUIREMENTS.md 2>/dev/null || true
For each requirement: parse description → identify supporting truths/artifacts → status: ✓ SATISFIED / ✗ BLOCKED / ? NEEDS HUMAN.
</step>
<step name="behavioral_verification">
**Run the project's test suite and CLI commands to verify behavior, not just structure.**
Static checks (grep, file existence, wiring) catch structural gaps but miss runtime
failures. This step runs actual tests and project commands to verify the phase goal
is behaviorally achieved.
This follows Anthropic's harness engineering principle: separating generation from
evaluation, with the evaluator interacting with the running system rather than
inspecting static artifacts.
**Step 1: Run test suite**
```bash
# Detect test runner and run all tests (timeout: 5 minutes)
TEST_EXIT=0
timeout 300 bash -c '
  if [ -f "package.json" ]; then
    npm test 2>&1
  elif [ -f "Cargo.toml" ]; then
    cargo test 2>&1
  elif [ -f "go.mod" ]; then
    go test ./... 2>&1
  elif [ -f "pyproject.toml" ] || [ -f "requirements.txt" ]; then
    python -m pytest -q --tb=short 2>&1 || uv run python -m pytest -q --tb=short 2>&1
  else
    echo "⚠ No test runner detected — skipping test suite"
    exit 3  # distinct exit code so a skip is not reported as a failure
  fi
'
TEST_EXIT=$?
if [ "${TEST_EXIT}" -eq 0 ]; then
  echo "✓ Test suite passed"
elif [ "${TEST_EXIT}" -eq 3 ]; then
  echo "⚠ Test suite skipped (no runner detected)"
elif [ "${TEST_EXIT}" -eq 124 ]; then
  echo "⚠ Test suite timed out after 5 minutes"
else
  echo "✗ Test suite failed (exit code ${TEST_EXIT})"
fi
```
Record: total tests, passed, failed, coverage (if available).
**If any tests fail:** Mark as `behavioral_failures` — these are BLOCKER severity
regardless of whether static checks passed. A phase cannot be verified if tests fail.
**Step 2: Run project CLI/commands from success criteria (if testable)**
For each success criterion that describes a user command (e.g., "User can run
`mixtiq validate`", "User can run `npm start`"):
1. Check if the command exists and required inputs are available:
- Look for example files in `templates/`, `fixtures/`, `test/`, `examples/`, or `testdata/`
- Check if the CLI binary/script exists on PATH or in the project
2. **If no suitable inputs or fixtures exist:** Mark as `? NEEDS HUMAN` with reason
"No test fixtures available — requires manual verification" and move on.
Do NOT invent example inputs.
3. If inputs are available: run the command and verify it exits successfully.
```bash
# Only run if both command and input exist
if command -v {project_cli} &>/dev/null && [ -f "{example_input}" ]; then
{project_cli} {example_input} 2>&1
fi
```
Record: command, exit code, output summary, pass/fail (or SKIPPED if no fixtures).
**Step 3: Report**
```
## Behavioral Verification
| Check | Result | Detail |
|-------|--------|--------|
| Test suite | {N} passed, {M} failed | {first failure if any} |
| {CLI command 1} | ✓ / ✗ | {output summary} |
| {CLI command 2} | ✓ / ✗ | {output summary} |
```
**If all behavioral checks pass:** Continue to scan_antipatterns.
**If any fail:** Add to verification gaps with BLOCKER severity.
</step>
<step name="scan_antipatterns">
Extract files modified in this phase from SUMMARY.md, scan each:


@@ -248,6 +248,8 @@ Display the returned checkpoint EXACTLY as-is:
- Do NOT add commentary before or after the block.
- If you notice protocol/meta markers such as `to=all:`, role-routing text, XML system tags, hidden instruction markers, ad copy, or any unrelated suffix, discard the draft and output `{CHECKPOINT}` only.
**Text mode (`workflow.text_mode: true` in config or `--text` flag):** Set `TEXT_MODE=true` if `--text` is present in `$ARGUMENTS` OR `text_mode` from init JSON is `true`. When TEXT_MODE is active, replace every `AskUserQuestion` call with a plain-text numbered list and ask the user to type their choice number. This is required for non-Claude runtimes (OpenAI Codex, Gemini CLI, etc.) where `AskUserQuestion` is not available.
Wait for user response (plain text, no AskUserQuestion).
</step>
@@ -437,8 +439,17 @@ If `SECURITY_CFG` is `true` AND `SECURITY_FILE` exists: check frontmatter `threa
```
If `SECURITY_CFG` is `false` OR (`SECURITY_FILE` exists AND `threats_open` is `0`):
**Auto-transition: mark phase complete in ROADMAP.md and STATE.md**
Execute the transition workflow inline (do NOT use Task — the orchestrator context already holds the UAT results and phase data needed for accurate transition):
Read and follow `~/.claude/get-shit-done/workflows/transition.md`.
After transition completes, present next-step options to the user:
```
All tests passed. Phase {phase} marked complete.
- `/gsd-plan-phase {next}` — Plan next phase
- `/gsd-execute-phase {next}` — Execute next phase
```


@@ -19,7 +19,7 @@ function detectConfigDir(baseDir) {
if (envDir && fs.existsSync(path.join(envDir, 'get-shit-done', 'VERSION'))) {
return envDir;
}
for (const dir of ['.claude', '.gemini', '.config/kilo', '.kilo', '.config/opencode', '.opencode']) {
if (fs.existsSync(path.join(baseDir, dir, 'get-shit-done', 'VERSION'))) {
return path.join(baseDir, dir);
}


@@ -52,17 +52,19 @@ process.stdin.on('end', () => {
process.exit(0);
}
// Check if context warnings are disabled via config.
// Quick sentinel check: skip config read entirely for non-GSD projects (#P2.5).
const cwd = data.cwd || process.cwd();
const planningDir = path.join(cwd, '.planning');
if (fs.existsSync(planningDir)) {
try {
const configPath = path.join(planningDir, 'config.json');
const config = JSON.parse(fs.readFileSync(configPath, 'utf8'));
if (config.hooks?.context_warnings === false) {
process.exit(0);
}
} catch (e) {
// Ignore config read/parse errors (config may not exist in .planning/)
}
}


@@ -36,6 +36,11 @@ process.stdin.on('end', () => {
process.exit(0);
}
// Claude Code natively enforces read-before-edit — skip the advisory (#1984)
if (process.env.CLAUDE_SESSION_ID) {
process.exit(0);
}
const filePath = data.tool_input?.file_path || '';
if (!filePath) {
process.exit(0);

package-lock.json generated

@@ -1,12 +1,12 @@
{
"name": "get-shit-done-cc",
"version": "1.34.2",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "get-shit-done-cc",
"version": "1.34.2",
"license": "MIT",
"bin": {
"get-shit-done-cc": "bin/install.js"


@@ -1,6 +1,6 @@
{
"name": "get-shit-done-cc",
"version": "1.34.2",
"description": "A meta-prompting, context engineering and spec-driven development system for Claude Code, OpenCode, Gemini and Codex by TÂCHES.",
"bin": {
"get-shit-done-cc": "bin/install.js"
@@ -10,7 +10,7 @@
"commands",
"get-shit-done",
"agents",
"hooks",
"scripts"
],
"keywords": [

tests/ai-evals.test.cjs Normal file

@@ -0,0 +1,400 @@
/**
* GSD AI Evals Framework Tests
*
* Validates the /gsd-ai-integration-phase + /gsd-eval-review contribution:
* - workflow.ai_integration_phase key in config defaults and config-set/get
* - W016 validate-health warning when ai_integration_phase absent
* - addAiIntegrationPhaseKey repair action
* - AI-SPEC.md template section completeness
* - New agent frontmatter (picked up by agent-frontmatter.test.cjs — covered there)
* - plan-phase.md Step 4.5 AI-keyword nudge block
* - ai-integration-phase and eval-review command frontmatter
* - ai-evals.md and ai-frameworks.md reference files exist and are non-empty
*/
const { test, describe, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert');
const fs = require('fs');
const path = require('path');
const { runGsdTools, createTempProject, cleanup } = require('./helpers.cjs');
const REPO_ROOT = path.join(__dirname, '..');
const AGENTS_DIR = path.join(REPO_ROOT, 'agents');
const COMMANDS_DIR = path.join(REPO_ROOT, 'commands', 'gsd');
const WORKFLOWS_DIR = path.join(REPO_ROOT, 'get-shit-done', 'workflows');
const TEMPLATES_DIR = path.join(REPO_ROOT, 'get-shit-done', 'templates');
const REFERENCES_DIR = path.join(REPO_ROOT, 'get-shit-done', 'references');
// ─── Helpers ─────────────────────────────────────────────────────────────────
function readConfig(tmpDir) {
return JSON.parse(fs.readFileSync(path.join(tmpDir, '.planning', 'config.json'), 'utf-8'));
}
function writeConfig(tmpDir, obj) {
fs.writeFileSync(
path.join(tmpDir, '.planning', 'config.json'),
JSON.stringify(obj, null, 2),
'utf-8'
);
}
function writeMinimalHealth(tmpDir) {
fs.writeFileSync(path.join(tmpDir, '.planning', 'PROJECT.md'),
'# Project\n\n## What This Is\n\nFoo.\n\n## Core Value\n\nBar.\n\n## Requirements\n\nBaz.\n');
fs.writeFileSync(path.join(tmpDir, '.planning', 'ROADMAP.md'),
'# Roadmap\n\n### Phase 1: Setup\n');
fs.writeFileSync(path.join(tmpDir, '.planning', 'STATE.md'),
'# Session State\n\nPhase 1 in progress.\n');
fs.mkdirSync(path.join(tmpDir, '.planning', 'phases', '01-setup'), { recursive: true });
}
// ─── Config: workflow.ai_integration_phase default ───────────────────────────────────────
describe('CONFIG: workflow.ai_integration_phase default', () => {
let tmpDir;
beforeEach(() => { tmpDir = createTempProject(); });
afterEach(() => { cleanup(tmpDir); });
test('config-ensure-section includes workflow.ai_integration_phase as boolean', () => {
const result = runGsdTools('config-ensure-section', tmpDir);
assert.ok(result.success, `Command failed: ${result.error}`);
const config = readConfig(tmpDir);
assert.ok(config.workflow && typeof config.workflow === 'object', 'workflow should exist');
assert.strictEqual(typeof config.workflow.ai_integration_phase, 'boolean', 'workflow.ai_integration_phase should be boolean');
});
test('workflow.ai_integration_phase defaults to true', () => {
runGsdTools('config-ensure-section', tmpDir);
const config = readConfig(tmpDir);
assert.strictEqual(config.workflow.ai_integration_phase, true, 'workflow.ai_integration_phase should default to true');
});
});
// ─── Config: config-set / config-get workflow.ai_integration_phase ───────────────────────
describe('CONFIG: config-set / config-get workflow.ai_integration_phase', () => {
let tmpDir;
beforeEach(() => {
tmpDir = createTempProject();
runGsdTools('config-ensure-section', tmpDir);
});
afterEach(() => { cleanup(tmpDir); });
test('config-set workflow.ai_integration_phase false persists as boolean false', () => {
const result = runGsdTools('config-set workflow.ai_integration_phase false', tmpDir);
assert.ok(result.success, `config-set failed: ${result.error}`);
const config = readConfig(tmpDir);
assert.strictEqual(config.workflow.ai_integration_phase, false);
assert.strictEqual(typeof config.workflow.ai_integration_phase, 'boolean');
});
test('config-set workflow.ai_integration_phase true persists as boolean true', () => {
runGsdTools('config-set workflow.ai_integration_phase false', tmpDir);
const result = runGsdTools('config-set workflow.ai_integration_phase true', tmpDir);
assert.ok(result.success, `config-set failed: ${result.error}`);
const config = readConfig(tmpDir);
assert.strictEqual(config.workflow.ai_integration_phase, true);
});
test('config-get workflow.ai_integration_phase returns the stored value', () => {
runGsdTools('config-set workflow.ai_integration_phase false', tmpDir);
const result = runGsdTools('config-get workflow.ai_integration_phase', tmpDir);
assert.ok(result.success, `config-get failed: ${result.error}`);
assert.strictEqual(JSON.parse(result.output), false);
});
});
// ─── Validate Health: W016 ────────────────────────────────────────────────────
describe('HEALTH: W016 — workflow.ai_integration_phase absent', () => {
let tmpDir;
beforeEach(() => { tmpDir = createTempProject(); });
afterEach(() => { cleanup(tmpDir); });
test('emits W016 when workflow.ai_integration_phase absent from config', () => {
writeMinimalHealth(tmpDir);
writeConfig(tmpDir, { model_profile: 'balanced', workflow: { research: true, nyquist_validation: true } });
const result = runGsdTools('validate health', tmpDir);
assert.ok(result.success, `Command failed: ${result.error}`);
const output = JSON.parse(result.output);
assert.ok(
output.warnings.some(w => w.code === 'W016'),
`Expected W016 in warnings: ${JSON.stringify(output.warnings)}`
);
});
test('does not emit W016 when workflow.ai_integration_phase is explicitly set', () => {
writeMinimalHealth(tmpDir);
writeConfig(tmpDir, {
model_profile: 'balanced',
workflow: { research: true, nyquist_validation: true, ai_integration_phase: true },
});
const result = runGsdTools('validate health', tmpDir);
assert.ok(result.success, `Command failed: ${result.error}`);
const output = JSON.parse(result.output);
assert.ok(
!output.warnings.some(w => w.code === 'W016'),
`Should not have W016: ${JSON.stringify(output.warnings)}`
);
});
test('does not emit W016 when workflow.ai_integration_phase is false (explicit opt-out)', () => {
writeMinimalHealth(tmpDir);
writeConfig(tmpDir, {
model_profile: 'balanced',
workflow: { research: true, nyquist_validation: true, ai_integration_phase: false },
});
const result = runGsdTools('validate health', tmpDir);
assert.ok(result.success, `Command failed: ${result.error}`);
const output = JSON.parse(result.output);
assert.ok(
!output.warnings.some(w => w.code === 'W016'),
`Should not have W016: ${JSON.stringify(output.warnings)}`
);
});
});
// ─── Validate Health --repair: addAiIntegrationPhaseKey ─────────────────────────────────
describe('HEALTH --repair: addAiIntegrationPhaseKey', () => {
let tmpDir;
beforeEach(() => { tmpDir = createTempProject(); });
afterEach(() => { cleanup(tmpDir); });
test('adds workflow.ai_integration_phase via addAiIntegrationPhaseKey repair', () => {
writeMinimalHealth(tmpDir);
const configPath = path.join(tmpDir, '.planning', 'config.json');
fs.writeFileSync(configPath,
JSON.stringify({ model_profile: 'balanced', workflow: { research: true, nyquist_validation: true } }, null, 2)
);
const result = runGsdTools('validate health --repair', tmpDir);
assert.ok(result.success, `Command failed: ${result.error}`);
const output = JSON.parse(result.output);
const addAction = output.repairs_performed.find(r => r.action === 'addAiIntegrationPhaseKey');
assert.ok(addAction, `Expected addAiIntegrationPhaseKey action: ${JSON.stringify(output.repairs_performed)}`);
assert.strictEqual(addAction.success, true);
const config = readConfig(tmpDir);
assert.strictEqual(config.workflow.ai_integration_phase, true);
});
});
// ─── AI-SPEC.md Template Structure ───────────────────────────────────────────
describe('TEMPLATE: AI-SPEC.md section completeness', () => {
const templatePath = path.join(TEMPLATES_DIR, 'AI-SPEC.md');
let content;
test('AI-SPEC.md template exists', () => {
assert.ok(fs.existsSync(templatePath), 'AI-SPEC.md template should exist');
content = fs.readFileSync(templatePath, 'utf-8');
assert.ok(content.length > 100, 'AI-SPEC.md should be non-empty');
});
const requiredSections = [
['## 1. System Classification', 'Section 1 (System Classification)'],
['## 1b. Domain Context', 'Section 1b (Domain Context)'],
['## 2. Framework Decision', 'Section 2 (Framework Decision)'],
['## 3. Framework Quick Reference','Section 3 (Framework Quick Reference)'],
['## 4. Implementation Guidance', 'Section 4 (Implementation Guidance)'],
['## 4b. AI Systems Best Practices','Section 4b (AI Systems Best Practices)'],
['## 5. Evaluation Strategy', 'Section 5 (Evaluation Strategy)'],
['## 6. Guardrails', 'Section 6 (Guardrails)'],
['## 7. Production Monitoring', 'Section 7 (Production Monitoring)'],
['## Checklist', 'Checklist section'],
];
for (const [heading, label] of requiredSections) {
test(`template contains ${label}`, () => {
const c = fs.readFileSync(templatePath, 'utf-8');
assert.ok(c.includes(heading), `Template missing: ${heading}`);
});
}
test('template checklist has at least 10 items', () => {
const c = fs.readFileSync(templatePath, 'utf-8');
const items = (c.match(/^- \[[ x]\]/gm) || []);
assert.ok(items.length >= 10, `Expected ≥10 checklist items, found ${items.length}`);
});
test('template Section 1b has domain rubric table columns (Good/Bad/Stakes)', () => {
const c = fs.readFileSync(templatePath, 'utf-8');
assert.ok(c.includes('What Domain Experts Evaluate Against'), 'Missing domain rubric subsection');
});
test('template Section 4b has Pydantic structured outputs guidance', () => {
const c = fs.readFileSync(templatePath, 'utf-8');
assert.ok(c.includes('Pydantic'), 'Section 4b missing Pydantic guidance');
});
test('template Section 6 has online guardrails and offline flywheel tables', () => {
const c = fs.readFileSync(templatePath, 'utf-8');
assert.ok(c.includes('Online'), 'Section 6 missing Online guardrails');
assert.ok(c.includes('Offline'), 'Section 6 missing Offline flywheel');
});
});
// ─── Command Frontmatter ──────────────────────────────────────────────────────
describe('COMMAND: ai-integration-phase and eval-review frontmatter', () => {
const commands = ['ai-integration-phase', 'eval-review'];
for (const cmd of commands) {
test(`${cmd}.md exists`, () => {
const p = path.join(COMMANDS_DIR, `${cmd}.md`);
assert.ok(fs.existsSync(p), `commands/gsd/${cmd}.md should exist`);
});
test(`${cmd}.md has name, description, argument-hint`, () => {
const content = fs.readFileSync(path.join(COMMANDS_DIR, `${cmd}.md`), 'utf-8');
const frontmatter = content.split('---')[1] || '';
assert.ok(frontmatter.includes('name:'), `${cmd}.md missing name:`);
assert.ok(frontmatter.includes('description:'), `${cmd}.md missing description:`);
assert.ok(frontmatter.includes('argument-hint:'), `${cmd}.md missing argument-hint:`);
});
}
test('ai-integration-phase.md name is gsd:ai-integration-phase', () => {
const content = fs.readFileSync(path.join(COMMANDS_DIR, 'ai-integration-phase.md'), 'utf-8');
assert.ok(content.includes('name: gsd:ai-integration-phase'), 'ai-integration-phase command name mismatch');
});
test('eval-review.md name is gsd:eval-review', () => {
const content = fs.readFileSync(path.join(COMMANDS_DIR, 'eval-review.md'), 'utf-8');
assert.ok(content.includes('name: gsd:eval-review'), 'eval-review command name mismatch');
});
});
// ─── New Agents Exist ─────────────────────────────────────────────────────────
describe('AGENTS: new AI-evals agents exist', () => {
const newAgents = [
'gsd-framework-selector',
'gsd-ai-researcher',
'gsd-domain-researcher',
'gsd-eval-planner',
'gsd-eval-auditor',
];
for (const agent of newAgents) {
test(`${agent}.md exists`, () => {
assert.ok(
fs.existsSync(path.join(AGENTS_DIR, `${agent}.md`)),
`agents/${agent}.md should exist`
);
});
}
});
// ─── Reference Files ──────────────────────────────────────────────────────────
describe('REFERENCES: ai-frameworks.md and ai-evals.md', () => {
const refs = ['ai-frameworks.md', 'ai-evals.md'];
for (const ref of refs) {
test(`${ref} exists and is non-empty`, () => {
const p = path.join(REFERENCES_DIR, ref);
assert.ok(fs.existsSync(p), `references/${ref} should exist`);
const content = fs.readFileSync(p, 'utf-8');
assert.ok(content.length > 200, `references/${ref} should have substantial content`);
});
}
test('ai-frameworks.md covers key frameworks', () => {
const content = fs.readFileSync(path.join(REFERENCES_DIR, 'ai-frameworks.md'), 'utf-8');
for (const fw of ['CrewAI', 'LlamaIndex', 'LangChain', 'LangGraph']) {
assert.ok(content.includes(fw), `ai-frameworks.md should mention ${fw}`);
}
});
test('ai-evals.md covers eval tooling defaults', () => {
const content = fs.readFileSync(path.join(REFERENCES_DIR, 'ai-evals.md'), 'utf-8');
assert.ok(content.includes('Arize Phoenix') || content.includes('Phoenix'), 'ai-evals.md should mention Arize Phoenix');
assert.ok(content.includes('RAGAS'), 'ai-evals.md should mention RAGAS');
});
});
// ─── Workflow: plan-phase Step 4.5 AI keyword nudge ──────────────────────────
describe('WORKFLOW: plan-phase.md AI nudge integration', () => {
const planPhasePath = path.join(WORKFLOWS_DIR, 'plan-phase.md');
test('plan-phase.md exists', () => {
assert.ok(fs.existsSync(planPhasePath), 'workflows/plan-phase.md should exist');
});
test('plan-phase.md contains AI keyword detection for LLM/agent/RAG terms', () => {
const content = fs.readFileSync(planPhasePath, 'utf-8');
assert.ok(
(content.includes('agent') && content.includes('llm')) || content.includes('rag') || content.includes('AI'),
'plan-phase.md should contain AI keyword detection'
);
});
test('plan-phase.md references /gsd-ai-integration-phase nudge', () => {
const content = fs.readFileSync(planPhasePath, 'utf-8');
assert.ok(
content.includes('ai-integration-phase') || content.includes('ai_integration_phase'),
'plan-phase.md should reference ai-integration-phase workflow'
);
});
test('plan-phase.md references workflow.ai_integration_phase config toggle', () => {
const content = fs.readFileSync(planPhasePath, 'utf-8');
assert.ok(
content.includes('ai_integration_phase'),
'plan-phase.md should check workflow.ai_integration_phase config'
);
});
});
// ─── Workflow: ai-integration-phase and eval-review workflows exist ──────────────────────
describe('WORKFLOW: ai-integration-phase and eval-review workflow files', () => {
const workflows = ['ai-integration-phase', 'eval-review'];
for (const wf of workflows) {
test(`${wf}.md workflow exists`, () => {
assert.ok(
fs.existsSync(path.join(WORKFLOWS_DIR, `${wf}.md`)),
`workflows/${wf}.md should exist`
);
});
}
test('ai-integration-phase.md orchestrates 4 agents', () => {
const content = fs.readFileSync(path.join(WORKFLOWS_DIR, 'ai-integration-phase.md'), 'utf-8');
for (const agent of ['gsd-framework-selector', 'gsd-ai-researcher', 'gsd-domain-researcher', 'gsd-eval-planner']) {
assert.ok(content.includes(agent), `ai-integration-phase.md should reference ${agent}`);
}
});
test('eval-review.md references gsd-eval-auditor', () => {
const content = fs.readFileSync(path.join(WORKFLOWS_DIR, 'eval-review.md'), 'utf-8');
assert.ok(content.includes('gsd-eval-auditor'), 'eval-review.md should reference gsd-eval-auditor');
});
test('select-framework.md does NOT exist (removed per design)', () => {
assert.ok(
!fs.existsSync(path.join(WORKFLOWS_DIR, 'select-framework.md')),
'select-framework.md should not exist — removed in favour of ai-integration-phase nudge'
);
});
});


@@ -0,0 +1,83 @@
/**
* Regression guard for #2012: AskUserQuestion is Claude Code-only — non-Claude
* runtimes (OpenAI Codex, Gemini, etc.) render it as a markdown code block
* instead of triggering the interactive TUI, so the session stalls.
*
* Every workflow that calls AskUserQuestion MUST include a TEXT_MODE fallback
* instruction so that, when `workflow.text_mode` is true (or `--text` is
* passed), all AskUserQuestion calls are replaced with plain-text numbered
* lists that any runtime can handle.
*
* The accepted fallback signal is:
* "TEXT_MODE" (or "text_mode") paired with "plain-text" (or "plain text")
* anywhere in the file — see hasTextModeFallback() below.
*/
'use strict';
const { describe, test } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const ROOT = path.join(__dirname, '..');
const WORKFLOWS_DIR = path.join(ROOT, 'get-shit-done', 'workflows');
/**
* Return true if the file content contains a TEXT_MODE / text_mode fallback
* instruction for AskUserQuestion calls.
*
* Acceptable forms (case-insensitive on key terms): any of
* - "TEXT_MODE" / "text_mode" / "text mode"
* paired with any of
* - "plain-text" / "plain text" / "numbered list"
*/
function hasTextModeFallback(content) {
const lower = content.toLowerCase();
const hasTextMode =
lower.includes('text_mode') ||
lower.includes('text mode');
const hasPlainText =
lower.includes('plain-text') ||
lower.includes('plain text') ||
lower.includes('numbered list');
return hasTextMode && hasPlainText;
}
describe('AskUserQuestion text-mode fallback (#2012)', () => {
test('every workflow that uses AskUserQuestion includes a TEXT_MODE plain-text fallback', () => {
const violations = [];
const files = fs.readdirSync(WORKFLOWS_DIR).filter(f => f.endsWith('.md'));
for (const fname of files) {
const fpath = path.join(WORKFLOWS_DIR, fname);
const content = fs.readFileSync(fpath, 'utf-8');
if (!content.includes('AskUserQuestion')) continue;
if (!hasTextModeFallback(content)) {
violations.push(fname);
}
}
assert.strictEqual(
violations.length,
0,
[
'AskUserQuestion is Claude Code-only (issue #2012).',
'Every workflow that uses AskUserQuestion must include a TEXT_MODE fallback',
'so non-Claude runtimes (OpenAI Codex, Gemini, etc.) can present questions',
'as plain-text numbered lists instead of stalling on an unexecuted tool call.',
'',
'Add this near the argument-parsing section of each workflow:',
' Set TEXT_MODE=true if --text is present in $ARGUMENTS OR text_mode from',
' init JSON is true. When TEXT_MODE is active, replace every AskUserQuestion',
' call with a plain-text numbered list and ask the user to type their choice',
' number.',
'',
'Workflows missing the fallback:',
...violations.map(v => ' get-shit-done/workflows/' + v),
].join('\n')
);
});
});
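For reference, the fallback these tests enforce can be sketched as a tiny renderer. The helper name below is illustrative, not part of the repo:

```javascript
// Hypothetical sketch (not in the codebase): what a TEXT_MODE fallback
// produces instead of an AskUserQuestion tool call — a plain-text numbered
// list that any runtime can display.
function renderPlainTextQuestion(question, options) {
  const lines = [question, ''];
  options.forEach((opt, i) => lines.push(`  ${i + 1}. ${opt}`));
  lines.push('', 'Type the number of your choice:');
  return lines.join('\n');
}

const prompt = renderPlainTextQuestion('Pick a phase to execute:', [
  'Phase 1: Foundation',
  'Phase 2: Features',
]);
```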


@@ -0,0 +1,78 @@
/**
* Tests for atomicWriteFileSync helper (issue #1915)
*/
const { test, describe, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const { createTempDir, cleanup } = require('./helpers.cjs');
const CORE_PATH = path.join(__dirname, '..', 'get-shit-done', 'bin', 'lib', 'core.cjs');
describe('atomicWriteFileSync', () => {
let tmpDir;
beforeEach(() => {
tmpDir = createTempDir();
});
afterEach(() => {
cleanup(tmpDir);
});
test('is exported from core.cjs', () => {
const core = require(CORE_PATH);
assert.strictEqual(typeof core.atomicWriteFileSync, 'function', 'atomicWriteFileSync must be exported');
});
test('writes correct content to the target file', () => {
const { atomicWriteFileSync } = require(CORE_PATH);
const filePath = path.join(tmpDir, 'test.md');
const content = '# Hello\nworld\n';
atomicWriteFileSync(filePath, content, 'utf-8');
const written = fs.readFileSync(filePath, 'utf-8');
assert.strictEqual(written, content, 'written content must match');
});
test('does not leave .tmp.* files after successful write', () => {
const { atomicWriteFileSync } = require(CORE_PATH);
const filePath = path.join(tmpDir, 'STATE.md');
atomicWriteFileSync(filePath, '# State\n', 'utf-8');
const entries = fs.readdirSync(tmpDir);
const tmpFiles = entries.filter(e => e.includes('.tmp.'));
assert.deepStrictEqual(tmpFiles, [], 'no .tmp.* files should remain after write');
});
test('overwrites an existing file with new content', () => {
const { atomicWriteFileSync } = require(CORE_PATH);
const filePath = path.join(tmpDir, 'config.json');
atomicWriteFileSync(filePath, '{"first":true}', 'utf-8');
atomicWriteFileSync(filePath, '{"second":true}', 'utf-8');
const written = fs.readFileSync(filePath, 'utf-8');
assert.strictEqual(written, '{"second":true}', 'second write must replace first');
});
test('cleans up stale tmp file if present before write', () => {
const { atomicWriteFileSync } = require(CORE_PATH);
const filePath = path.join(tmpDir, 'ROADMAP.md');
// Place a stale tmp file matching the pattern used by atomicWriteFileSync
const staleTmp = filePath + '.tmp.' + process.pid;
fs.writeFileSync(staleTmp, 'stale content', 'utf-8');
atomicWriteFileSync(filePath, '# Roadmap\n', 'utf-8');
const entries = fs.readdirSync(tmpDir);
const tmpFiles = entries.filter(e => e.includes('.tmp.'));
assert.deepStrictEqual(tmpFiles, [], 'stale .tmp.* file must be gone after write');
const written = fs.readFileSync(filePath, 'utf-8');
assert.strictEqual(written, '# Roadmap\n', 'target file must have correct content');
});
});


@@ -0,0 +1,37 @@
/**
* Regression test for #2043 — autonomous.md must include Agent in allowed-tools.
*
* The gsd-autonomous skill spawns background agents via Agent(..., run_in_background=true).
* Without Agent in allowed-tools the runtime rejects those calls silently.
*/
'use strict';
const { describe, test } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
describe('commands/gsd/autonomous.md allowed-tools', () => {
test('includes Agent in allowed-tools list', () => {
const filePath = path.join(__dirname, '..', 'commands', 'gsd', 'autonomous.md');
const content = fs.readFileSync(filePath, 'utf-8');
// Extract the YAML frontmatter block between the first pair of --- delimiters
const frontmatterMatch = content.match(/^---\n([\s\S]*?)\n---/);
assert.ok(frontmatterMatch, 'autonomous.md must have YAML frontmatter');
const frontmatter = frontmatterMatch[1];
// Parse the allowed-tools list items (lines starting with " - ")
const toolLines = frontmatter
.split('\n')
.filter((line) => /^\s+-\s+/.test(line))
.map((line) => line.replace(/^\s+-\s+/, '').trim());
assert.ok(
toolLines.includes('Agent'),
`allowed-tools must include "Agent" but found: [${toolLines.join(', ')}]`
);
});
});


@@ -0,0 +1,103 @@
/**
* Regression tests for bug #1754
*
* The installer must NOT register .js hook entries in settings.json when the
* corresponding .js file does not exist at the target path. The original bug:
* on fresh installs where hooks/dist/ was missing from the npm package (as in
* v1.32.0), the hook copy step produced no files, yet the registration step
* ran unconditionally for .js hooks — leaving users with "PreToolUse:Bash
* hook error" on every tool invocation.
*
* The .sh hooks already had fs.existsSync() guards (added in #1817). This
* test verifies the same defensive pattern exists for all .js hooks.
*/
'use strict';
const { describe, test, before } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const INSTALL_SRC = path.join(__dirname, '..', 'bin', 'install.js');
const JS_HOOKS = [
{ name: 'gsd-check-update.js', registrationAnchor: 'hasGsdUpdateHook' },
{ name: 'gsd-context-monitor.js', registrationAnchor: 'hasContextMonitorHook' },
{ name: 'gsd-prompt-guard.js', registrationAnchor: 'hasPromptGuardHook' },
{ name: 'gsd-read-guard.js', registrationAnchor: 'hasReadGuardHook' },
{ name: 'gsd-workflow-guard.js', registrationAnchor: 'hasWorkflowGuardHook' },
];
describe('bug #1754: .js hook registration guards', () => {
let src;
before(() => {
src = fs.readFileSync(INSTALL_SRC, 'utf-8');
});
for (const { name, registrationAnchor } of JS_HOOKS) {
describe(`${name} registration`, () => {
test(`install.js checks file existence before registering ${name}`, () => {
// Find the registration block by locating the "has...Hook" variable
const anchorIdx = src.indexOf(registrationAnchor);
assert.ok(
anchorIdx !== -1,
`${registrationAnchor} variable not found in install.js`
);
// Extract a window around the registration block to find the guard
const blockStart = anchorIdx;
const blockEnd = Math.min(src.length, anchorIdx + 1200);
const block = src.slice(blockStart, blockEnd);
// The block must contain an fs.existsSync check for the hook file
assert.ok(
block.includes('fs.existsSync') || block.includes('existsSync'),
`install.js must call fs.existsSync on the target path before registering ${name} ` +
`in settings.json. Without this guard, hooks are registered even when the .js file ` +
`was never copied (the root cause of #1754).`
);
});
test(`install.js emits a warning when ${name} is missing`, () => {
// The hook file name (without extension) should appear in a warning message
const hookBaseName = name.replace('.js', '');
const warnPattern = `Skipped`;
const anchorIdx = src.indexOf(registrationAnchor);
const block = src.slice(anchorIdx, Math.min(src.length, anchorIdx + 1200));
assert.ok(
block.includes(warnPattern) && block.includes(hookBaseName),
`install.js must emit a skip warning when ${name} is not found at the target path`
);
});
});
}
test('all .js hooks use the same guard pattern as .sh hooks', () => {
// Count existsSync calls in the hook registration section.
// There should be guards for all JS hooks plus the existing SH hooks.
// This test ensures new hooks added in the future follow the same pattern.
const registrationSection = src.slice(
src.indexOf('// Configure SessionStart hook'),
src.indexOf('return { settingsPath, settings, statuslineCommand')
);
// Count unique hook file existence checks (pattern: path.join(targetDir, 'hooks', 'gsd-*.js'))
const jsGuards = (registrationSection.match(/gsd-[\w-]+\.js.*not found at target/g) || []);
const shGuards = (registrationSection.match(/gsd-[\w-]+\.sh.*not found at target/g) || []);
assert.ok(
jsGuards.length >= JS_HOOKS.length,
`Expected at least ${JS_HOOKS.length} .js hook guards, found ${jsGuards.length}. ` +
`Every .js hook registration must check file existence before registering.`
);
assert.ok(
shGuards.length >= 3,
`Expected at least 3 .sh hook guards (validate-commit, session-state, phase-boundary), ` +
`found ${shGuards.length}.`
);
});
});


@@ -0,0 +1,58 @@
/**
* Regression tests for bug #1891
*
* gsd-tools.cjs must transparently resolve @file: references in stdout
* so that workflows never see the @file: prefix. This eliminates the
* bash-specific `if [[ "$INIT" == @file:* ]]` check that breaks on
* PowerShell and other non-bash shells.
*/
'use strict';
const { describe, test, before } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const GSD_TOOLS_SRC = path.join(__dirname, '..', 'get-shit-done', 'bin', 'gsd-tools.cjs');
describe('bug #1891: @file: resolution in gsd-tools.cjs', () => {
let src;
before(() => {
src = fs.readFileSync(GSD_TOOLS_SRC, 'utf-8');
});
test('main() intercepts stdout and resolves @file: references', () => {
// The non-pick path should have @file: resolution, just like the --pick path
assert.ok(
src.includes("captured.startsWith('@file:')") ||
src.includes('captured.startsWith(\'@file:\')'),
'main() should check for @file: prefix in captured output'
);
});
test('@file: resolution reads file content via readFileSync', () => {
// Verify the resolution reads the actual file
assert.ok(
src.includes("readFileSync(captured.slice(6)") ||
src.includes('readFileSync(captured.slice(6)'),
'@file: resolution should read file at the path after the prefix'
);
});
test('stdout interception wraps runCommand in the non-pick path', () => {
// main() should resolve @file: in BOTH the --pick path and the normal
// path, so the marker string '@file:' must appear at least twice in main()
const mainFunc = src.slice(src.indexOf('async function main()'));
const firstAt = mainFunc.indexOf("'@file:'");
const secondAt = mainFunc.indexOf("'@file:'", firstAt + 1);
assert.ok(secondAt > firstAt,
'Both --pick and normal paths should resolve @file: references');
});
});


@@ -0,0 +1,82 @@
/**
* Regression tests for bug #1906
*
* Local installs must anchor hook command paths to $CLAUDE_PROJECT_DIR so
* hooks resolve correctly regardless of the shell's current working directory.
*
* The original bug: local install hook commands used bare relative paths like
* `node .claude/hooks/gsd-context-monitor.js`. Claude Code persists the bash
* tool's cwd between calls, so a single `cd subdir && …` early in a session
* permanently broke every hook for the rest of that session.
*
* The fix prefixes all local hook commands with "$CLAUDE_PROJECT_DIR"/ so
* path resolution is always anchored to the project root.
*/
'use strict';
const { describe, test, before } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const INSTALL_SRC = path.join(__dirname, '..', 'bin', 'install.js');
// All hooks that the installer registers for local installs
const HOOKS = [
'gsd-statusline.js',
'gsd-check-update.js',
'gsd-context-monitor.js',
'gsd-prompt-guard.js',
'gsd-read-guard.js',
'gsd-workflow-guard.js',
'gsd-validate-commit.sh',
'gsd-session-state.sh',
'gsd-phase-boundary.sh',
];
describe('bug #1906: local hook commands use $CLAUDE_PROJECT_DIR', () => {
let src;
before(() => {
src = fs.readFileSync(INSTALL_SRC, 'utf-8');
});
test('localPrefix variable is defined with $CLAUDE_PROJECT_DIR', () => {
assert.match(src, /const localPrefix\s*=\s*['"]\"\$CLAUDE_PROJECT_DIR['"]\s*\//,
'localPrefix should be defined using $CLAUDE_PROJECT_DIR');
});
for (const hook of HOOKS) {
test(`${hook} local command uses localPrefix (not bare dirName)`, () => {
// Find all local command strings for this hook
// The pattern is: `<runner> ' + localPrefix + '/hooks/<hook>'`
// or the old broken pattern: `<runner> ' + dirName + '/hooks/<hook>'`
const hookEscaped = hook.replace(/\./g, '\\.');
const brokenPattern = new RegExp(
`['"](?:node|bash)\\s['"]\\s*\\+\\s*dirName\\s*\\+\\s*['"]/hooks/${hookEscaped}['"]`
);
assert.ok(
!brokenPattern.test(src),
`${hook} must not use bare dirName — should use localPrefix for cwd-independent resolution`
);
});
}
test('no local hook command uses bare dirName + /hooks/', () => {
// Broader check: no local (non-global) hook path should use dirName directly
// The pattern `': '<runner> ' + dirName + '/hooks/'` is the broken form
const lines = src.split('\n');
const offenders = [];
for (let i = 0; i < lines.length; i++) {
const line = lines[i];
// Match lines that build local hook commands with bare dirName
if (/['"](?:node|bash)\s['"][^;]*\+\s*dirName\s*\+\s*['"]\/hooks\//.test(line)) {
offenders.push(`line ${i + 1}: ${line.trim()}`);
}
}
assert.equal(offenders.length, 0,
'Found local hook commands using bare dirName instead of localPrefix:\n' +
offenders.join('\n'));
});
});
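The fixed command shape can be sketched as follows; the `.claude` directory name and the helper name are assumptions for illustration, not the literal install.js code:

```javascript
// Sketch: anchor every local hook command to $CLAUDE_PROJECT_DIR so path
// resolution survives `cd` inside the persistent bash session.
const localPrefix = '"$CLAUDE_PROJECT_DIR"/.claude';

function buildLocalHookCommand(hookFile) {
  const runner = hookFile.endsWith('.sh') ? 'bash' : 'node';
  return `${runner} ${localPrefix}/hooks/${hookFile}`;
}
```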


@@ -0,0 +1,133 @@
/**
* Regression test for bug #1908
*
* `--uninstall` did not remove `gsd-file-manifest.json` from the target
* directory, leaving a stale metadata file after uninstall.
*
* Fix: `uninstall()` must call
* fs.rmSync(path.join(targetDir, MANIFEST_NAME), { force: true })
* after cleaning up the rest of the GSD artefacts.
*/
'use strict';
process.env.GSD_TEST_MODE = '1';
const { describe, test, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { uninstall } = require('../bin/install.js');
const MANIFEST_NAME = 'gsd-file-manifest.json';
// ─── helpers ──────────────────────────────────────────────────────────────────
function createFakeInstall(prefix = 'gsd-uninstall-test-') {
const dir = fs.mkdtempSync(path.join(os.tmpdir(), prefix));
// Simulate the minimum directory/file layout produced by the installer:
// get-shit-done/ directory, agents/ directory, and the manifest file.
fs.mkdirSync(path.join(dir, 'get-shit-done', 'workflows'), { recursive: true });
fs.writeFileSync(path.join(dir, 'get-shit-done', 'workflows', 'execute-phase.md'), '# stub');
fs.mkdirSync(path.join(dir, 'agents'), { recursive: true });
fs.writeFileSync(path.join(dir, 'agents', 'gsd-executor.md'), '# stub');
const manifest = {
version: '1.34.0',
timestamp: new Date().toISOString(),
files: {
'get-shit-done/workflows/execute-phase.md': 'abc123',
'agents/gsd-executor.md': 'def456',
},
};
fs.writeFileSync(path.join(dir, MANIFEST_NAME), JSON.stringify(manifest, null, 2));
return dir;
}
function cleanup(dir) {
try { fs.rmSync(dir, { recursive: true, force: true }); } catch {}
}
// ─── tests ────────────────────────────────────────────────────────────────────
describe('uninstall — manifest cleanup (#1908)', () => {
let tmpDir;
beforeEach(() => {
tmpDir = createFakeInstall();
});
afterEach(() => {
cleanup(tmpDir);
});
test('gsd-file-manifest.json is removed after global uninstall', () => {
const manifestPath = path.join(tmpDir, MANIFEST_NAME);
// Pre-condition: manifest exists before uninstall
assert.ok(
fs.existsSync(manifestPath),
'Test setup failure: manifest file should exist before uninstall'
);
// Run uninstall against tmpDir (pass it via CLAUDE_CONFIG_DIR so getGlobalDir()
// resolves to our temp directory; pass isGlobal=true)
const savedEnv = process.env.CLAUDE_CONFIG_DIR;
process.env.CLAUDE_CONFIG_DIR = tmpDir;
try {
uninstall(true, 'claude');
} finally {
if (savedEnv === undefined) {
delete process.env.CLAUDE_CONFIG_DIR;
} else {
process.env.CLAUDE_CONFIG_DIR = savedEnv;
}
}
assert.ok(
!fs.existsSync(manifestPath),
[
`${MANIFEST_NAME} must be removed by uninstall() but still exists at`,
manifestPath,
].join(' ')
);
});
test('gsd-file-manifest.json is removed after local uninstall', () => {
const manifestPath = path.join(tmpDir, MANIFEST_NAME);
assert.ok(
fs.existsSync(manifestPath),
'Test setup failure: manifest file should exist before uninstall'
);
// For a local install, getGlobalDir is not called — targetDir = cwd + dirName.
// Simulate by creating .claude/ inside tmpDir and placing artefacts there.
const localDir = path.join(tmpDir, '.claude');
fs.mkdirSync(path.join(localDir, 'get-shit-done', 'workflows'), { recursive: true });
fs.writeFileSync(path.join(localDir, 'get-shit-done', 'workflows', 'execute-phase.md'), '# stub');
const localManifestPath = path.join(localDir, MANIFEST_NAME);
fs.writeFileSync(localManifestPath, JSON.stringify({ version: '1.34.0', files: {} }, null, 2));
const savedCwd = process.cwd();
process.chdir(tmpDir);
try {
uninstall(false, 'claude');
} finally {
process.chdir(savedCwd);
}
assert.ok(
!fs.existsSync(localManifestPath),
[
`${MANIFEST_NAME} must be removed by uninstall() (local) but still exists at`,
localManifestPath,
].join(' ')
);
});
});


@@ -0,0 +1,264 @@
/**
* Regression tests for bug #1924: gsd-update silently deletes user-generated files
*
* Running the installer (gsd-update / re-install) must not delete:
* - get-shit-done/USER-PROFILE.md (created by /gsd-profile-user)
* - commands/gsd/dev-preferences.md (created by /gsd-profile-user)
*
* Root cause:
* 1. copyWithPathReplacement() calls fs.rmSync(destDir, {recursive:true}) before
* copying — no preserve allowlist. This wipes USER-PROFILE.md.
* 2. ~line 5211 explicitly rmSync's commands/gsd/ during global install legacy
* cleanup — no preserve. This wipes dev-preferences.md.
*
* Fix requirement:
* - install() must preserve USER-PROFILE.md across the get-shit-done/ wipe
* - install() must preserve dev-preferences.md across the commands/gsd/ wipe
*
* Closes: #1924
*/
'use strict';
const { describe, test, beforeEach, afterEach, before } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const os = require('os');
const { execFileSync } = require('child_process');
const INSTALL_SCRIPT = path.join(__dirname, '..', 'bin', 'install.js');
const BUILD_SCRIPT = path.join(__dirname, '..', 'scripts', 'build-hooks.js');
// ─── Ensure hooks/dist/ is populated before any install test ─────────────────
before(() => {
execFileSync(process.execPath, [BUILD_SCRIPT], {
encoding: 'utf-8',
stdio: 'pipe',
});
});
// ─── Helpers ─────────────────────────────────────────────────────────────────
function createTempDir(prefix) {
return fs.mkdtempSync(path.join(os.tmpdir(), prefix));
}
function cleanup(dir) {
try { fs.rmSync(dir, { recursive: true, force: true }); } catch { /* ignore */ }
}
/**
* Run the installer with CLAUDE_CONFIG_DIR redirected to a temp directory.
* Explicitly removes GSD_TEST_MODE so the subprocess actually runs the installer
* (not just the export block). Uses --yes to suppress interactive prompts.
*/
function runInstaller(configDir) {
const env = { ...process.env, CLAUDE_CONFIG_DIR: configDir };
delete env.GSD_TEST_MODE;
execFileSync(process.execPath, [INSTALL_SCRIPT, '--claude', '--global', '--yes'], {
encoding: 'utf-8',
stdio: 'pipe',
env,
});
}
// ─── Test 1: USER-PROFILE.md is preserved across re-install ─────────────────
describe('#1924: USER-PROFILE.md preserved across re-install (global Claude)', () => {
let tmpDir;
beforeEach(() => {
tmpDir = createTempDir('gsd-1924-userprofile-');
});
afterEach(() => {
cleanup(tmpDir);
});
test('USER-PROFILE.md exists after initial install + user creation', () => {
runInstaller(tmpDir);
// Simulate /gsd-profile-user creating USER-PROFILE.md inside get-shit-done/
const profilePath = path.join(tmpDir, 'get-shit-done', 'USER-PROFILE.md');
fs.writeFileSync(profilePath, '# My Profile\n\nCustom user content.\n');
assert.ok(
fs.existsSync(profilePath),
'USER-PROFILE.md should exist after being created by /gsd-profile-user'
);
});
test('USER-PROFILE.md is preserved after re-install', () => {
// First install
runInstaller(tmpDir);
// User runs /gsd-profile-user, creating USER-PROFILE.md
const profilePath = path.join(tmpDir, 'get-shit-done', 'USER-PROFILE.md');
const originalContent = '# My Profile\n\nThis is my custom user profile content.\n';
fs.writeFileSync(profilePath, originalContent);
// Re-run installer (simulating gsd-update)
runInstaller(tmpDir);
assert.ok(
fs.existsSync(profilePath),
'USER-PROFILE.md must survive re-install — gsd-update must not delete user-generated profiles'
);
const afterContent = fs.readFileSync(profilePath, 'utf8');
assert.strictEqual(
afterContent,
originalContent,
'USER-PROFILE.md content must be identical after re-install'
);
});
test('USER-PROFILE.md is preserved even when get-shit-done/ is wiped and recreated', () => {
runInstaller(tmpDir);
const gsdDir = path.join(tmpDir, 'get-shit-done');
const profilePath = path.join(gsdDir, 'USER-PROFILE.md');
// Confirm get-shit-done/ was created by install
assert.ok(fs.existsSync(gsdDir), 'get-shit-done/ must exist after install');
// Write profile
fs.writeFileSync(profilePath, '# Profile\n\nMy coding style preferences.\n');
// Re-install
runInstaller(tmpDir);
// get-shit-done/ must still exist AND profile must be intact
assert.ok(fs.existsSync(gsdDir), 'get-shit-done/ must still exist after re-install');
assert.ok(
fs.existsSync(profilePath),
'USER-PROFILE.md must still exist after get-shit-done/ was wiped and recreated'
);
});
});
// ─── Test 2: dev-preferences.md is preserved across re-install ───────────────
describe('#1924: dev-preferences.md preserved across re-install (global Claude)', () => {
let tmpDir;
beforeEach(() => {
tmpDir = createTempDir('gsd-1924-devprefs-');
});
afterEach(() => {
cleanup(tmpDir);
});
test('dev-preferences.md is preserved when commands/gsd/ is cleaned up during re-install', () => {
// First install (creates skills/ structure for global Claude)
runInstaller(tmpDir);
// User runs /gsd-profile-user — it creates dev-preferences.md in commands/gsd/
const commandsGsdDir = path.join(tmpDir, 'commands', 'gsd');
fs.mkdirSync(commandsGsdDir, { recursive: true });
const devPrefsPath = path.join(commandsGsdDir, 'dev-preferences.md');
const originalContent = '# Dev Preferences\n\nI prefer TDD. I like short functions.\n';
fs.writeFileSync(devPrefsPath, originalContent);
// Re-run installer (simulating gsd-update)
// Bug: this triggers legacy cleanup that rmSync's commands/gsd/ entirely,
// deleting dev-preferences.md
runInstaller(tmpDir);
assert.ok(
fs.existsSync(devPrefsPath),
'dev-preferences.md must survive re-install — gsd-update legacy cleanup must not delete user-generated files'
);
const afterContent = fs.readFileSync(devPrefsPath, 'utf8');
assert.strictEqual(
afterContent,
originalContent,
'dev-preferences.md content must be identical after re-install'
);
});
test('legacy non-user GSD commands are still cleaned up during re-install', () => {
// First install
runInstaller(tmpDir);
// Simulate a legacy GSD command file being left in commands/gsd/
const commandsGsdDir = path.join(tmpDir, 'commands', 'gsd');
fs.mkdirSync(commandsGsdDir, { recursive: true });
const legacyFile = path.join(commandsGsdDir, 'next.md');
fs.writeFileSync(legacyFile, '---\nname: gsd:next\n---\n\nLegacy content.');
// But dev-preferences.md is also there (user-generated)
const devPrefsPath = path.join(commandsGsdDir, 'dev-preferences.md');
fs.writeFileSync(devPrefsPath, '# Dev Preferences\n\nMy preferences.\n');
// Re-install
runInstaller(tmpDir);
// dev-preferences.md must be preserved
assert.ok(
fs.existsSync(devPrefsPath),
'dev-preferences.md must be preserved while legacy commands/gsd/ is cleaned up'
);
// The legacy GSD command (next.md) is NOT user-generated, should be removed
// (it would exist only as a skill now in skills/gsd-next/SKILL.md)
assert.ok(
!fs.existsSync(legacyFile),
'legacy GSD command next.md in commands/gsd/ must be removed during cleanup'
);
});
});
// ─── Test 3: profile-user.md backup path is outside get-shit-done/ ───────────
describe('#1924: profile-user.md backup path must be outside get-shit-done/', () => {
test('profile-user.md backup uses ~/.claude/USER-PROFILE.backup.md not ~/.claude/get-shit-done/USER-PROFILE.backup.md', () => {
const workflowPath = path.join(
__dirname, '..', 'get-shit-done', 'workflows', 'profile-user.md'
);
const content = fs.readFileSync(workflowPath, 'utf8');
// The backup must NOT be inside get-shit-done/ because that directory is wiped on update
assert.ok(
!content.includes('get-shit-done/USER-PROFILE.backup.md'),
'backup path must NOT be inside get-shit-done/ — that directory is wiped on gsd-update'
);
// The backup should be at ~/.claude/USER-PROFILE.backup.md (outside get-shit-done/)
assert.ok(
content.includes('USER-PROFILE.backup.md') &&
!content.includes('/get-shit-done/USER-PROFILE.backup.md'),
'backup path must be outside get-shit-done/ (e.g. ~/.claude/USER-PROFILE.backup.md)'
);
});
});
// ─── Test 4: preserveUserArtifacts helper exported from install.js ────────────
describe('#1924: preserveUserArtifacts helper exists in install.js', () => {
test('install.js exports preserveUserArtifacts function', () => {
// Set GSD_TEST_MODE so require() reaches the module.exports block
const origMode = process.env.GSD_TEST_MODE;
process.env.GSD_TEST_MODE = '1';
let mod;
try {
mod = require(INSTALL_SCRIPT);
} finally {
if (origMode === undefined) {
delete process.env.GSD_TEST_MODE;
} else {
process.env.GSD_TEST_MODE = origMode;
}
}
assert.strictEqual(
typeof mod.preserveUserArtifacts,
'function',
'install.js must export preserveUserArtifacts helper for testability'
);
});
});


@@ -0,0 +1,49 @@
/**
* Regression tests for bug #1962
*
* normalizePhaseName must preserve the original case of letter suffixes.
* Uppercasing "16c" to "16C" causes directory/roadmap mismatches on
* case-sensitive filesystems — init progress can't match the directory
* back to the roadmap phase entry.
*/
'use strict';
const { describe, test } = require('node:test');
const assert = require('node:assert/strict');
const { normalizePhaseName } = require('../get-shit-done/bin/lib/core.cjs');
describe('bug #1962: normalizePhaseName preserves letter suffix case', () => {
test('lowercase suffix preserved: 16c → 16c', () => {
assert.equal(normalizePhaseName('16c'), '16c');
});
test('uppercase suffix preserved: 16C → 16C', () => {
assert.equal(normalizePhaseName('16C'), '16C');
});
test('single digit padded with lowercase suffix: 1a → 01a', () => {
assert.equal(normalizePhaseName('1a'), '01a');
});
test('single digit padded with uppercase suffix: 1A → 01A', () => {
assert.equal(normalizePhaseName('1A'), '01A');
});
test('no suffix unchanged: 16 → 16', () => {
assert.equal(normalizePhaseName('16'), '16');
});
test('decimal suffix preserved: 16.1 → 16.1', () => {
assert.equal(normalizePhaseName('16.1'), '16.1');
});
test('letter + decimal preserved: 16c.2 → 16c.2', () => {
assert.equal(normalizePhaseName('16c.2'), '16c.2');
});
test('project code prefix stripped, suffix case preserved: CK-01a → 01a', () => {
assert.equal(normalizePhaseName('CK-01a'), '01a');
});
});
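A case-preserving normalizer consistent with the expectations above could look like this (a sketch, not the actual core.cjs code):

```javascript
// Sketch: pad the leading number to two digits, strip an optional
// project-code prefix, and keep any suffix verbatim — never upper- or
// lowercase it (the root cause of #1962).
function normalizePhaseName(name) {
  const m = String(name).match(/^(?:[A-Za-z]+-)?(\d+)(.*)$/);
  if (!m) return name;
  const [, num, suffix] = m;
  return num.padStart(2, '0') + suffix; // suffix case preserved as-is
}
```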


@@ -0,0 +1,137 @@
/**
* Regression tests for bug #1998
*
* phase complete must update the top-level overview bullet checkbox
* (- [ ] Phase N: → - [x] Phase N:) in addition to the Progress table row.
*
* Root cause: the checkbox update used replaceInCurrentMilestone() which
* scopes to content after </details>, missing the current milestone's
* overview bullets that appear before any <details> blocks.
*/
'use strict';
const { describe, test, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('node:fs');
const path = require('node:path');
const os = require('node:os');
const { execFileSync } = require('node:child_process');
const gsdTools = path.resolve(__dirname, '..', 'get-shit-done', 'bin', 'gsd-tools.cjs');
describe('bug #1998: phase complete updates overview checkbox', () => {
let tmpDir;
let planningDir;
beforeEach(() => {
tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'gsd-1998-'));
planningDir = path.join(tmpDir, '.planning');
fs.mkdirSync(planningDir, { recursive: true });
// Minimal config
fs.writeFileSync(
path.join(planningDir, 'config.json'),
JSON.stringify({ project_code: 'TEST' })
);
// Minimal STATE.md
fs.writeFileSync(
path.join(planningDir, 'STATE.md'),
'---\ncurrent_phase: 1\nstatus: executing\n---\n# State\n'
);
});
afterEach(() => {
fs.rmSync(tmpDir, { recursive: true, force: true });
});
test('checkbox updated when no archived milestones exist', () => {
const phasesDir = path.join(planningDir, 'phases', '01-foundation');
fs.mkdirSync(phasesDir, { recursive: true });
fs.writeFileSync(
path.join(phasesDir, '01-1-SUMMARY.md'),
'---\nstatus: complete\n---\n# Summary\nDone.'
);
fs.writeFileSync(
path.join(phasesDir, '01-1-PLAN.md'),
'---\nphase: 1\nplan: 1\n---\n# Plan 1\n'
);
const roadmapPath = path.join(planningDir, 'ROADMAP.md');
fs.writeFileSync(roadmapPath, [
'# Roadmap',
'',
'## Phases',
'',
'- [ ] **Phase 1: Foundation** - core setup',
'- [ ] **Phase 2: Features** - add features',
'',
'## Progress',
'',
'| Phase | Plans | Status | Completed |',
'|-------|-------|--------|-----------|',
'| 1. Foundation | 0/1 | Pending | - |',
'| 2. Features | 0/1 | Pending | - |',
].join('\n'));
try {
execFileSync('node', [gsdTools, 'phase', 'complete', '1'], { cwd: tmpDir, timeout: 10000 });
} catch {
// Command may exit non-zero if STATE.md update fails, but ROADMAP.md update happens first
}
const result = fs.readFileSync(roadmapPath, 'utf-8');
assert.match(result, /- \[x\] \*\*Phase 1: Foundation\*\*/, 'overview checkbox should be checked');
assert.match(result, /- \[ \] \*\*Phase 2: Features\*\*/, 'phase 2 checkbox should remain unchecked');
});
test('checkbox updated when archived milestones exist in <details>', () => {
const phasesDir = path.join(planningDir, 'phases', '01-setup');
fs.mkdirSync(phasesDir, { recursive: true });
fs.writeFileSync(
path.join(phasesDir, '01-1-SUMMARY.md'),
'---\nstatus: complete\n---\n# Summary\nDone.'
);
fs.writeFileSync(
path.join(phasesDir, '01-1-PLAN.md'),
'---\nphase: 1\nplan: 1\n---\n# Plan 1\n'
);
const roadmapPath = path.join(planningDir, 'ROADMAP.md');
fs.writeFileSync(roadmapPath, [
'# Roadmap v2.0',
'',
'## Phases',
'',
'- [ ] **Phase 1: Setup** - initial setup',
'- [ ] **Phase 2: Build** - build features',
'',
'## Progress',
'',
'| Phase | Plans | Status | Completed |',
'|-------|-------|--------|-----------|',
'| 1. Setup | 0/1 | Pending | - |',
'| 2. Build | 0/1 | Pending | - |',
'',
'<details>',
'<summary>v1.0 (Archived)</summary>',
'',
'## v1.0 Phases',
'- [x] **Phase 1: Init** - initialization',
'- [x] **Phase 2: Deploy** - deployment',
'',
'</details>',
].join('\n'));
try {
execFileSync('node', [gsdTools, 'phase', 'complete', '1'], { cwd: tmpDir, timeout: 10000 });
} catch {
// May exit non-zero
}
const result = fs.readFileSync(roadmapPath, 'utf-8');
assert.match(result, /- \[x\] \*\*Phase 1: Setup\*\*/, 'current milestone checkbox should be checked');
assert.match(result, /- \[ \] \*\*Phase 2: Build\*\*/, 'phase 2 checkbox should remain unchecked');
});
});
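The fix described in the header can be sketched by splitting the roadmap at the first `<details>` block and rewriting only the current-milestone region (illustrative helper, not the actual gsd-tools.cjs code):

```javascript
// Sketch: the overview bullet lives BEFORE any archived <details> block,
// so a replacement scoped to content after </details> misses it. Split
// first, replace only in the current-milestone slice.
function checkPhaseCheckbox(roadmap, phaseNum) {
  const archiveStart = roadmap.indexOf('<details>');
  const current = archiveStart === -1 ? roadmap : roadmap.slice(0, archiveStart);
  const archived = archiveStart === -1 ? '' : roadmap.slice(archiveStart);
  const pattern = new RegExp('- \\[ \\] (\\*\\*Phase ' + phaseNum + ':)');
  return current.replace(pattern, '- [x] $1') + archived;
}
```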

Some files were not shown because too many files have changed in this diff.