fix: backfill Chroma vector DB for all projects on startup (#1154)

* fix: backfill all Chroma projects on worker startup

ChromaSync.ensureBackfilled() existed but was never called. After
v10.2.2's bun cache clear destroyed the ONNX model cache, Chroma only
had ~2 days of embeddings while SQLite had 49k+ observations.

- Add static backfillAllProjects() to ChromaSync — iterates all projects
  in SQLite, creates temporary ChromaSync per project, runs smart diff
- Call backfillAllProjects() fire-and-forget on worker startup
- Add 'CHROMA_SYNC' to logger Component type (pre-existing gap)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: sanitize project names for Chroma collection naming

Replace characters outside [a-zA-Z0-9._-] with underscores so projects
like "YC Stuff" map to collection "cm__YC_Stuff" instead of failing
Chroma's collection name validation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: route backfill to shared cm__claude-mem collection, harden sanitization

- Use single ChromaSync('claude-mem') in backfillAllProjects() instead of
  per-project instances, matching how DatabaseManager and SearchManager
  operate — fixes critical bug where backfilled data landed in orphaned
  collections that no search path reads from
- Strip trailing non-alphanumeric chars from sanitized collection names
  to satisfy Chroma's end-character constraint
- Guard backfill behind Chroma server readiness to avoid one spurious error
  log per project when Chroma failed to start
- Use CHROMA_SYNC log component consistently for backfill messages

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* refactor: pass project as parameter to ensureBackfilled instead of mutating instance state

Eliminates shared mutable state in backfillAllProjects() loop. Project
scoping is now passed explicitly via parameter to both ensureBackfilled()
and getExistingChromaIds(), keeping a single Chroma connection while
avoiding fragile instance property mutation across iterations.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-17 22:47:46 -05:00
committed by GitHub
parent eea4f599c0
commit ca8421611c
5 changed files with 294 additions and 243 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -85,7 +85,12 @@ export class ChromaSync {
/**
 * Create a sync client for `project`.
 *
 * The Chroma collection name is derived from the project name as
 * `cm__<sanitized>`; the original, unsanitized name is kept in `this.project`.
 *
 * @param project - Project name; may contain characters Chroma does not accept.
 */
constructor(project: string) {
  this.project = project;

  // Chroma collection names only allow [a-zA-Z0-9._-], 3-512 chars,
  // must start/end with [a-zA-Z0-9], and must not contain consecutive dots.
  // The fixed prefix guarantees a valid first character and the minimum length.
  const prefix = 'cm__';
  const maxNameLength = 512;
  const sanitized = project
    .replace(/[^a-zA-Z0-9._-]/g, '_') // map disallowed characters to '_'
    .replace(/\.{2,}/g, '.') // collapse runs of dots ("a..b" -> "a.b")
    .slice(0, maxNameLength - prefix.length) // keep the full name within the length cap
    .replace(/[^a-zA-Z0-9]+$/, ''); // strip trailing non-alphanumeric

  // Fall back to a fixed name when nothing usable is left (e.g. project "---").
  this.collectionName = `${prefix}${sanitized || 'unknown'}`;
  this.VECTOR_DB_DIR = path.join(os.homedir(), '.claude-mem', 'vector-db');
}
@@ -543,17 +548,18 @@ export class ChromaSync {
* Fetch all existing document IDs from Chroma collection
* Returns Sets of SQLite IDs for observations, summaries, and prompts
*/
private async getExistingChromaIds(): Promise<{
private async getExistingChromaIds(projectOverride?: string): Promise<{
observations: Set<number>;
summaries: Set<number>;
prompts: Set<number>;
}> {
const targetProject = projectOverride ?? this.project;
await this.ensureCollection();
if (!this.collection) {
throw new Error(
'Chroma collection not initialized. Call ensureCollection() before using collection methods.' +
` Project: ${this.project}`
` Project: ${targetProject}`
);
}
@@ -564,14 +570,14 @@ export class ChromaSync {
let offset = 0;
const limit = 1000; // Large batches, metadata only = fast
logger.info('CHROMA_SYNC', 'Fetching existing Chroma document IDs...', { project: this.project });
logger.info('CHROMA_SYNC', 'Fetching existing Chroma document IDs...', { project: targetProject });
while (true) {
try {
const result = await this.collection.get({
limit,
offset,
where: { project: this.project },
where: { project: targetProject },
include: ['metadatas']
});
@@ -598,18 +604,18 @@ export class ChromaSync {
offset += limit;
logger.debug('CHROMA_SYNC', 'Fetched batch of existing IDs', {
project: this.project,
project: targetProject,
offset,
batchSize: metadatas.length
});
} catch (error) {
logger.error('CHROMA_SYNC', 'Failed to fetch existing IDs', { project: this.project }, error as Error);
logger.error('CHROMA_SYNC', 'Failed to fetch existing IDs', { project: targetProject }, error as Error);
throw error;
}
}
logger.info('CHROMA_SYNC', 'Existing IDs fetched', {
project: this.project,
project: targetProject,
observations: observationIds.size,
summaries: summaryIds.size,
prompts: promptIds.size
@@ -621,15 +627,18 @@ export class ChromaSync {
/**
* Backfill: Sync all observations missing from Chroma
* Reads from SQLite and syncs in batches
* @param projectOverride - If provided, backfill this project instead of this.project.
* Used by backfillAllProjects() to iterate projects without mutating instance state.
* Throws error if backfill fails
*/
async ensureBackfilled(): Promise<void> {
logger.info('CHROMA_SYNC', 'Starting smart backfill', { project: this.project });
async ensureBackfilled(projectOverride?: string): Promise<void> {
const backfillProject = projectOverride ?? this.project;
logger.info('CHROMA_SYNC', 'Starting smart backfill', { project: backfillProject });
await this.ensureCollection();
// Fetch existing IDs from Chroma (fast, metadata only)
const existing = await this.getExistingChromaIds();
const existing = await this.getExistingChromaIds(backfillProject);
const db = new SessionStore();
@@ -645,14 +654,14 @@ export class ChromaSync {
SELECT * FROM observations
WHERE project = ? ${obsExclusionClause}
ORDER BY id ASC
`).all(this.project) as StoredObservation[];
`).all(backfillProject) as StoredObservation[];
const totalObsCount = db.db.prepare(`
SELECT COUNT(*) as count FROM observations WHERE project = ?
`).get(this.project) as { count: number };
`).get(backfillProject) as { count: number };
logger.info('CHROMA_SYNC', 'Backfilling observations', {
project: this.project,
project: backfillProject,
missing: observations.length,
existing: existing.observations.size,
total: totalObsCount.count
@@ -670,7 +679,7 @@ export class ChromaSync {
await this.addDocuments(batch);
logger.debug('CHROMA_SYNC', 'Backfill progress', {
project: this.project,
project: backfillProject,
progress: `${Math.min(i + this.BATCH_SIZE, allDocs.length)}/${allDocs.length}`
});
}
@@ -686,14 +695,14 @@ export class ChromaSync {
SELECT * FROM session_summaries
WHERE project = ? ${summaryExclusionClause}
ORDER BY id ASC
`).all(this.project) as StoredSummary[];
`).all(backfillProject) as StoredSummary[];
const totalSummaryCount = db.db.prepare(`
SELECT COUNT(*) as count FROM session_summaries WHERE project = ?
`).get(this.project) as { count: number };
`).get(backfillProject) as { count: number };
logger.info('CHROMA_SYNC', 'Backfilling summaries', {
project: this.project,
project: backfillProject,
missing: summaries.length,
existing: existing.summaries.size,
total: totalSummaryCount.count
@@ -711,7 +720,7 @@ export class ChromaSync {
await this.addDocuments(batch);
logger.debug('CHROMA_SYNC', 'Backfill progress', {
project: this.project,
project: backfillProject,
progress: `${Math.min(i + this.BATCH_SIZE, summaryDocs.length)}/${summaryDocs.length}`
});
}
@@ -732,17 +741,17 @@ export class ChromaSync {
JOIN sdk_sessions s ON up.content_session_id = s.content_session_id
WHERE s.project = ? ${promptExclusionClause}
ORDER BY up.id ASC
`).all(this.project) as StoredUserPrompt[];
`).all(backfillProject) as StoredUserPrompt[];
const totalPromptCount = db.db.prepare(`
SELECT COUNT(*) as count
FROM user_prompts up
JOIN sdk_sessions s ON up.content_session_id = s.content_session_id
WHERE s.project = ?
`).get(this.project) as { count: number };
`).get(backfillProject) as { count: number };
logger.info('CHROMA_SYNC', 'Backfilling user prompts', {
project: this.project,
project: backfillProject,
missing: prompts.length,
existing: existing.prompts.size,
total: totalPromptCount.count
@@ -760,13 +769,13 @@ export class ChromaSync {
await this.addDocuments(batch);
logger.debug('CHROMA_SYNC', 'Backfill progress', {
project: this.project,
project: backfillProject,
progress: `${Math.min(i + this.BATCH_SIZE, promptDocs.length)}/${promptDocs.length}`
});
}
logger.info('CHROMA_SYNC', 'Smart backfill complete', {
project: this.project,
project: backfillProject,
synced: {
observationDocs: allDocs.length,
summaryDocs: summaryDocs.length,
@@ -780,7 +789,7 @@ export class ChromaSync {
});
} catch (error) {
logger.error('CHROMA_SYNC', 'Backfill failed', { project: this.project }, error as Error);
logger.error('CHROMA_SYNC', 'Backfill failed', { project: backfillProject }, error as Error);
throw new Error(`Backfill failed: ${error instanceof Error ? error.message : String(error)}`);
} finally {
db.close();
@@ -867,6 +876,38 @@ export class ChromaSync {
}
}
/**
 * Backfill all projects that have observations in SQLite but may be missing from Chroma.
 * Uses a single shared ChromaSync('claude-mem') instance and Chroma connection.
 * Per-project scoping is passed as a parameter to ensureBackfilled(), avoiding
 * instance state mutation. All documents land in the cm__claude-mem collection
 * with project scoped via metadata, matching how DatabaseManager and SearchManager operate.
 * Designed to be called fire-and-forget on worker startup.
 *
 * A failure for one project is logged and does not stop the remaining projects.
 * The SQLite handle is closed even if closing the Chroma client fails.
 */
static async backfillAllProjects(): Promise<void> {
  const db = new SessionStore();
  try {
    const sync = new ChromaSync('claude-mem');
    try {
      // Only projects with a non-null, non-empty name are considered.
      const projects = db.db.prepare(
        'SELECT DISTINCT project FROM observations WHERE project IS NOT NULL AND project != ?'
      ).all('') as { project: string }[];

      logger.info('CHROMA_SYNC', `Backfill check for ${projects.length} projects`);

      for (const { project } of projects) {
        try {
          await sync.ensureBackfilled(project);
        } catch (error) {
          logger.error('CHROMA_SYNC', `Backfill failed for project: ${project}`, { project }, error as Error);
          // Continue to next project — don't let one failure stop others
        }
      }
    } finally {
      await sync.close();
    }
  } finally {
    // Outer finally: runs even when sync.close() rejects, so the DB handle never leaks.
    db.close();
  }
}
/**
* Close the Chroma client connection
* Server lifecycle is managed by ChromaServerManager, not here

View File

@@ -19,6 +19,7 @@ import { SettingsDefaultsManager } from '../shared/SettingsDefaultsManager.js';
import { getAuthMethodDescription } from '../shared/EnvManager.js';
import { logger } from '../utils/logger.js';
import { ChromaServerManager } from './sync/ChromaServerManager.js';
import { ChromaSync } from './sync/ChromaSync.js';
// Windows: avoid repeated spawn popups when startup fails (issue #921)
const WINDOWS_SPAWN_COOLDOWN_MS = 2 * 60 * 1000;
@@ -423,6 +424,15 @@ export class WorkerService {
this.server.registerRoutes(this.searchRoutes);
logger.info('WORKER', 'SearchManager initialized and search routes registered');
// Auto-backfill Chroma for all projects if out of sync with SQLite (fire-and-forget)
if (this.chromaServer !== null || chromaMode !== 'local') {
ChromaSync.backfillAllProjects().then(() => {
logger.info('CHROMA_SYNC', 'Backfill check complete for all projects');
}).catch(error => {
logger.error('CHROMA_SYNC', 'Backfill failed (non-blocking)', {}, error as Error);
});
}
// Connect to MCP server
const mcpServerPath = path.join(__dirname, 'mcp-server.cjs');
const transport = new StdioClientTransport({

View File

@@ -15,7 +15,7 @@ export enum LogLevel {
SILENT = 4
}
export type Component = 'HOOK' | 'WORKER' | 'SDK' | 'PARSER' | 'DB' | 'SYSTEM' | 'HTTP' | 'SESSION' | 'CHROMA' | 'FOLDER_INDEX' | 'CLAUDE_MD';
export type Component = 'HOOK' | 'WORKER' | 'SDK' | 'PARSER' | 'DB' | 'SYSTEM' | 'HTTP' | 'SESSION' | 'CHROMA' | 'CHROMA_SYNC' | 'FOLDER_INDEX' | 'CLAUDE_MD';
interface LogContext {
sessionId?: number;