chore(scripts): replace worktree-remap with cwd-based remap using pending_messages.cwd

The old worktree-remap.ts tried to reconstruct per-session cwd by regex-matching absolute paths that incidentally leaked into observation free-text (files_read, source_input_summary, metadata, user_prompt). That source is derived and lossy: it only hit 1/3498 plain-project sessions in practice.

pending_messages.cwd is the structured, authoritative cwd captured from every hook payload — 7,935 of 8,473 rows are populated. cwd-remap.ts uses that column as the source of truth:

1. Pull every distinct cwd from pending_messages.cwd
2. For each cwd, classify with git:
   - rev-parse --absolute-git-dir vs --git-common-dir → main vs worktree
   - rev-parse --show-toplevel for the correct leaf (handles cwds that are subdirs of the worktree root)
   Parent project name = basename(dirname(common-dir)); composite is parent/worktree for worktrees, basename(toplevel) for main repos.
3. For each session, take the EARLIEST pending_messages.cwd (not the dominant one — claude-mem's own hooks run from nested .context/claude-mem/ directories and would otherwise poison the count).
4. Apply UPDATEs in a single transaction across sdk_sessions, observations, and session_summaries. Auto-backs-up the DB first.

Result on a real DB: 41 sessions remapped (vs 1 previously), 1,694 observations and 3,091 session_summaries updated to match. 43 cwds skipped (deleted worktrees / non-repos) are left untouched — no inference when the data isn't there.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-25 17:15:04 +02:00 · 2026-04-16 16:40:37 -07:00
parent 040729beef
commit 148e1892df
2 changed files with 174 additions and 170 deletions
--- a/scripts/cwd-remap.ts
+++ b/scripts/cwd-remap.ts
@@ -0,0 +1,174 @@
+#!/usr/bin/env bun
+/**
+ * cwd-remap — Rewrite sdk_sessions.project (+ observations.project,
+ * session_summaries.project) using the cwd captured per-message in
+ * pending_messages.cwd as the single source of truth.
+ *
+ * For each distinct cwd:
+ *   - git -C <cwd> rev-parse --absolute-git-dir  AND  --git-common-dir
+ *     If they differ → worktree. parent = basename(dirname(common-dir)),
+ *     project = parent/<basename(toplevel)>.
+ *     Else → project = basename(toplevel).
+ *     (toplevel = rev-parse --show-toplevel, so a cwd inside a subdirectory
+ *     still resolves to the worktree root.)
+ *   - If the directory doesn't exist, or git errors, skip that cwd.
+ *
+ * Usage:
+ *   bun scripts/cwd-remap.ts          # dry-run (default)
+ *   bun scripts/cwd-remap.ts --apply  # write updates in a single transaction
+ */
+
+import { Database } from 'bun:sqlite';
+import { homedir } from 'os';
+import { join, basename, dirname } from 'path';
+import { existsSync, copyFileSync } from 'fs';
+import { spawnSync } from 'child_process';
+
+// Database file this script reads and (with --apply) updates:
+// <home directory>/.claude-mem/claude-mem.db
+const DB_PATH = join(homedir(), '.claude-mem', 'claude-mem.db');
+// True when `--apply` appears anywhere in the process arguments; otherwise the run is a dry-run.
+const APPLY = process.argv.includes('--apply');
+
+/**
+ * Result of classifying one cwd with git, discriminated by `kind`.
+ * `project` is the value that gets written to the `project` columns.
+ */
+type Classification =
+  // Git dir and common dir are the same path: project is the top-level directory name.
+  | { kind: 'main'; project: string }
+  // Git dir differs from the common dir: project is `<parent>/<top-level directory name>`,
+  // where `parent` is the name derived from the common dir.
+  | { kind: 'worktree'; project: string; parent: string }
+  // The cwd could not be classified; `reason` is a short machine-style tag printed in the report.
+  | { kind: 'skip'; reason: string };
+
+/**
+ * Run `git -C <cwd> <args...>` synchronously and capture its output as UTF-8.
+ *
+ * @param cwd  Directory handed to git's `-C` option.
+ * @param args Remaining git arguments, e.g. `['rev-parse', '--show-toplevel']`.
+ * @returns Trimmed stdout when the exit status is 0; `null` for any other
+ *          status (a `null` status included).
+ */
+function git(cwd: string, args: string[]): string | null {
+  const { status, stdout } = spawnSync('git', ['-C', cwd, ...args], { encoding: 'utf8' });
+  return status === 0 ? stdout.trim() : null;
+}
+
+/**
+ * Resolve the project name for one recorded cwd by asking git about it.
+ *
+ * Returns `skip` when the directory does not exist or any of the three
+ * `git rev-parse` probes yields nothing; `main` when the git dir and the
+ * common dir are the same path; otherwise `worktree`, named `<parent>/<leaf>`.
+ *
+ * @param cwd Directory path taken from pending_messages.cwd.
+ * @returns The classification; for non-skip results `project` is the new project name.
+ */
+function classify(cwd: string): Classification {
+  if (!existsSync(cwd)) return { kind: 'skip', reason: 'cwd-missing' };
+
+  const gitDir = git(cwd, ['rev-parse', '--absolute-git-dir']);
+  if (!gitDir) return { kind: 'skip', reason: 'not-a-git-repo' };
+
+  const commonDir = git(cwd, ['rev-parse', '--path-format=absolute', '--git-common-dir']);
+  if (!commonDir) return { kind: 'skip', reason: 'no-common-dir' };
+
+  // Use the worktree root, not the cwd — a session may be in a subdir.
+  const toplevel = git(cwd, ['rev-parse', '--show-toplevel']);
+  if (!toplevel) return { kind: 'skip', reason: 'no-toplevel' };
+  const leaf = basename(toplevel);
+
+  if (gitDir === commonDir) {
+    return { kind: 'main', project: leaf };
+  }
+
+  // Worktree: derive the parent repo's name from the common dir.
+  //   <parent>/.git (or another dot-dir such as <parent>/.bare) → name of the containing directory
+  //   <parent>.git or <parent> (bare repo)                       → the directory's own name, minus ".git"
+  // Previously both cases took dirname(commonDir), which named a bare parent
+  // after the directory that merely contains the repo.
+  const commonBase = basename(commonDir);
+  const parent = commonBase.startsWith('.')
+    ? basename(dirname(commonDir))
+    : basename(commonDir, '.git');
+  return { kind: 'worktree', project: `${parent}/${leaf}`, parent };
+}
+
+/**
+ * Entry point: classify every recorded cwd, work out the target project for
+ * each session, print a report, and (with --apply) write the changes.
+ *
+ * Exits with status 1 when the database file does not exist. Without
+ * --apply nothing is written; the function returns after printing the report.
+ * With --apply a backup snapshot is taken first and all UPDATEs run in a
+ * single transaction.
+ */
+function main() {
+  if (!existsSync(DB_PATH)) {
+    console.error(`DB not found at ${DB_PATH}`);
+    process.exit(1);
+  }
+
+  const db = new Database(DB_PATH);
+
+  if (APPLY) {
+    // Snapshot through SQLite instead of copying the file: a raw copy of a
+    // database that another process has open (or that runs in WAL mode, where
+    // recent commits live in the -wal sidecar) can be stale or inconsistent.
+    const backup = `${DB_PATH}.bak-cwd-remap-${Date.now()}`;
+    db.prepare('VACUUM INTO ?').run(backup);
+    console.log(`Backup created: ${backup}`);
+  }
+
+  // Every distinct non-empty cwd, with how many messages carried it.
+  const cwdRows = db.prepare(`
+    SELECT cwd, COUNT(*) AS messages
+    FROM pending_messages
+    WHERE cwd IS NOT NULL AND cwd != ''
+    GROUP BY cwd
+  `).all() as Array<{ cwd: string; messages: number }>;
+
+  console.log(`Classifying ${cwdRows.length} distinct cwds via git...`);
+
+  // Classify each cwd once; sessions look their cwd up in this map below.
+  const byCwd = new Map<string, Classification>();
+  const counts = { main: 0, worktree: 0, skip: 0 };
+  for (const { cwd } of cwdRows) {
+    const c = classify(cwd);
+    byCwd.set(cwd, c);
+    counts[c.kind]++;
+  }
+  console.log(`  main=${counts.main}  worktree=${counts.worktree}  skip=${counts.skip}`);
+
+  // Skipped cwds (so user sees what's missing)
+  const skipped = [...byCwd.entries()].filter(([, c]) => c.kind === 'skip') as Array<[string, Extract<Classification, { kind: 'skip' }>]>;
+  if (skipped.length) {
+    console.log('\nSkipped cwds:');
+    for (const [cwd, c] of skipped) console.log(`  [${c.reason}] ${cwd}`);
+  }
+
+  // Per-session target: use the EARLIEST pending_messages.cwd for each session.
+  // (Dominant-cwd is wrong: claude-mem's own hooks run from nested dirs like
+  //  `.context/claude-mem/` and dominate the count, misattributing the session.)
+  // "Earliest" is the row with the smallest pending_messages.id that has a non-empty cwd.
+  const sessionRows = db.prepare(`
+    SELECT s.id AS session_id, s.memory_session_id, s.content_session_id, s.project AS old_project, p.cwd
+    FROM sdk_sessions s
+    JOIN pending_messages p ON p.content_session_id = s.content_session_id
+    WHERE p.cwd IS NOT NULL AND p.cwd != ''
+      AND p.id = (
+        SELECT MIN(p2.id) FROM pending_messages p2
+        WHERE p2.content_session_id = s.content_session_id
+          AND p2.cwd IS NOT NULL AND p2.cwd != ''
+      )
+  `).all() as Array<{ session_id: number; memory_session_id: string | null; content_session_id: string; old_project: string; cwd: string }>;
+
+  type Target = { sessionId: number; memorySessionId: string | null; contentSessionId: string; oldProject: string; newProject: string; cwd: string };
+  const perSession = new Map<number, Target>();
+
+  for (const r of sessionRows) {
+    const c = byCwd.get(r.cwd);
+    // Sessions whose cwd could not be classified are left untouched.
+    if (!c || c.kind === 'skip') continue;
+    perSession.set(r.session_id, {
+      sessionId: r.session_id,
+      memorySessionId: r.memory_session_id,
+      contentSessionId: r.content_session_id,
+      oldProject: r.old_project,
+      newProject: c.project,
+      cwd: r.cwd,
+    });
+  }
+
+  // Only sessions whose project name actually changes are reported and updated.
+  const targets = [...perSession.values()].filter(t => t.oldProject !== t.newProject);
+
+  console.log(`\nSessions linked to a classified cwd: ${perSession.size}`);
+  console.log(`Sessions whose project would change: ${targets.length}`);
+
+  // Count sessions per "old → new" mapping for the report, most frequent first.
+  const summary = new Map<string, number>();
+  for (const t of targets) {
+    const key = `${t.oldProject}  →  ${t.newProject}`;
+    summary.set(key, (summary.get(key) ?? 0) + 1);
+  }
+  const rows = [...summary.entries()]
+    .map(([mapping, n]) => ({ mapping, sessions: n }))
+    .sort((a, b) => b.sessions - a.sessions);
+  console.log('\nTop mappings:');
+  console.table(rows.slice(0, 30));
+  if (rows.length > 30) console.log(`  …and ${rows.length - 30} more mappings`);
+
+  if (!APPLY) {
+    console.log('\nDry-run only. Re-run with --apply to perform UPDATEs.');
+    return;
+  }
+
+  const updSession = db.prepare('UPDATE sdk_sessions      SET project = ? WHERE id = ?');
+  const updObs     = db.prepare('UPDATE observations      SET project = ? WHERE memory_session_id = ?');
+  const updSum     = db.prepare('UPDATE session_summaries SET project = ? WHERE memory_session_id = ?');
+
+  let sessionN = 0, obsN = 0, sumN = 0;
+  const tx = db.transaction(() => {
+    for (const t of targets) {
+      sessionN += updSession.run(t.newProject, t.sessionId).changes;
+      // observations and session_summaries are keyed by memory_session_id,
+      // so a session without one has no dependent rows to update here.
+      if (t.memorySessionId) {
+        obsN += updObs.run(t.newProject, t.memorySessionId).changes;
+        sumN += updSum.run(t.newProject, t.memorySessionId).changes;
+      }
+    }
+  });
+  tx();
+
+  console.log(`\nApplied. sessions=${sessionN} observations=${obsN} session_summaries=${sumN}`);
+}
+
+main();
--- a/scripts/worktree-remap.ts
+++ b/scripts/worktree-remap.ts
@@ -1,170 +0,0 @@
-#!/usr/bin/env bun
-/**
- * worktree-remap — Retroactively reattribute past sessions that were written
- * with a plain project name (e.g. `claude-mem`) to the `parent/worktree`
- * composite name when the original worktree can be inferred from the paths
- * in the session's observations or user prompt.
- *
- * Only sessions with HIGH-CONFIDENCE worktree path signatures are remapped.
- * Everything else is left alone.
- *
- * Usage:
- *   bun scripts/worktree-remap.ts           # dry-run (default)
- *   bun scripts/worktree-remap.ts --apply   # write changes in a transaction
- */
-
-import { Database } from 'bun:sqlite';
-import { homedir } from 'os';
-import { join } from 'path';
-import { existsSync, copyFileSync } from 'fs';
-
-const DB_PATH = join(homedir(), '.claude-mem', 'claude-mem.db');
-const APPLY = process.argv.includes('--apply');
-
-const WORKTREE_PATTERNS: Array<{ name: string; regex: RegExp }> = [
-  { name: 'conductor', regex: /\/conductor\/workspaces\/([^/]+)\/([^/"'\s)]+)/ },
-  { name: 'superset',  regex: /\/\.superset\/worktrees\/([^/]+)\/([^/"'\s)]+)/ },
-];
-
-interface SessionRow {
-  id: number;
-  memory_session_id: string | null;
-  project: string;
-  user_prompt: string | null;
-}
-
-function allMatches(text: string | null | undefined): Array<{ parent: string; worktree: string }> {
-  if (!text) return [];
-  const results: Array<{ parent: string; worktree: string }> = [];
-  for (const p of WORKTREE_PATTERNS) {
-    const global = new RegExp(p.regex.source, 'g');
-    let m: RegExpExecArray | null;
-    while ((m = global.exec(text)) !== null) {
-      results.push({ parent: m[1], worktree: m[2] });
-    }
-  }
-  return results;
-}
-
-/**
- * Collects every worktree path match across the session's observations + user prompt,
- * then picks the inference using this priority:
- *   1. A match whose worktree basename === the session's current plain project name.
- *      (Pre-#1820 sessions stored the worktree basename as `project` — these are trusted.)
- *   2. If none match the current project, and there's a single unambiguous (parent, worktree)
- *      across ALL signals, use it.
- *   3. Otherwise skip (ambiguous — likely cross-worktree reads).
- */
-function inferWorktree(
-  db: Database,
-  memorySessionId: string | null,
-  userPrompt: string | null,
-  currentProject: string
-): { parent: string; worktree: string } | null {
-  const matches: Array<{ parent: string; worktree: string }> = [];
-
-  if (memorySessionId) {
-    const rows = db.prepare(`
-      SELECT files_read, files_modified, source_input_summary, metadata
-      FROM observations
-      WHERE memory_session_id = ?
-        AND (files_read LIKE '%/conductor/workspaces/%' OR files_modified LIKE '%/conductor/workspaces/%'
-             OR source_input_summary LIKE '%/conductor/workspaces/%' OR metadata LIKE '%/conductor/workspaces/%'
-             OR files_read LIKE '%.superset/worktrees/%' OR files_modified LIKE '%.superset/worktrees/%'
-             OR source_input_summary LIKE '%.superset/worktrees/%' OR metadata LIKE '%.superset/worktrees/%')
-    `).all(memorySessionId) as Array<{ files_read: string | null; files_modified: string | null; source_input_summary: string | null; metadata: string | null }>;
-
-    for (const r of rows) {
-      matches.push(...allMatches(r.files_read));
-      matches.push(...allMatches(r.files_modified));
-      matches.push(...allMatches(r.source_input_summary));
-      matches.push(...allMatches(r.metadata));
-    }
-  }
-
-  matches.push(...allMatches(userPrompt));
-  if (matches.length === 0) return null;
-
-  const wtMatch = matches.find(m => m.worktree === currentProject);
-  if (wtMatch) return wtMatch;
-
-  const signatures = new Set(matches.map(m => `${m.parent}/${m.worktree}`));
-  if (signatures.size === 1) return matches[0];
-
-  return null;
-}
-
-function main() {
-  if (!existsSync(DB_PATH)) {
-    console.error(`DB not found at ${DB_PATH}`);
-    process.exit(1);
-  }
-
-  if (APPLY) {
-    const backup = `${DB_PATH}.bak-worktree-remap-${Date.now()}`;
-    copyFileSync(DB_PATH, backup);
-    console.log(`Backup created: ${backup}`);
-  }
-
-  const db = new Database(DB_PATH);
-
-  const sessions = db.prepare(`
-    SELECT id, memory_session_id, project, user_prompt
-    FROM sdk_sessions
-    WHERE project NOT LIKE '%/%' AND project != ''
-  `).all() as SessionRow[];
-
-  console.log(`Scanning ${sessions.length} plain-project sessions...`);
-
-  type Remap = { sessionId: number; memorySessionId: string | null; oldProject: string; newProject: string };
-  const remaps: Remap[] = [];
-  const summary = new Map<string, { count: number; firstExample: number }>();
-
-  for (const s of sessions) {
-    const hit = inferWorktree(db, s.memory_session_id, s.user_prompt, s.project);
-    if (!hit) continue;
-
-    const newProject = `${hit.parent}/${hit.worktree}`;
-    if (newProject === s.project) continue;
-
-    remaps.push({ sessionId: s.id, memorySessionId: s.memory_session_id, oldProject: s.project, newProject });
-    const key = `${s.project} → ${newProject}`;
-    const entry = summary.get(key);
-    if (entry) entry.count++;
-    else summary.set(key, { count: 1, firstExample: s.id });
-  }
-
-  const rows = Array.from(summary.entries())
-    .map(([mapping, v]) => ({ mapping, sessions: v.count, exampleSessionId: v.firstExample }))
-    .sort((a, b) => b.sessions - a.sessions);
-
-  console.log('\nRemap summary:');
-  console.table(rows);
-  console.log(`\nTotal sessions to remap: ${remaps.length}`);
-
-  if (!APPLY) {
-    console.log('\nDry-run only. Re-run with --apply to perform UPDATEs.');
-    return;
-  }
-
-  console.log('\nApplying updates in a single transaction...');
-  const updateSession = db.prepare('UPDATE sdk_sessions      SET project=? WHERE id=?');
-  const updateObs     = db.prepare('UPDATE observations      SET project=? WHERE memory_session_id=?');
-  const updateSum     = db.prepare('UPDATE session_summaries SET project=? WHERE memory_session_id=?');
-
-  let sessionUpdates = 0, obsUpdates = 0, sumUpdates = 0;
-  const tx = db.transaction(() => {
-    for (const r of remaps) {
-      sessionUpdates += updateSession.run(r.newProject, r.sessionId).changes;
-      if (r.memorySessionId) {
-        obsUpdates += updateObs.run(r.newProject, r.memorySessionId).changes;
-        sumUpdates += updateSum.run(r.newProject, r.memorySessionId).changes;
-      }
-    }
-  });
-  tx();
-
-  console.log(`Done. sessions=${sessionUpdates} observations=${obsUpdates} session_summaries=${sumUpdates}`);
-}
-
-main();