fix(digest-dedup): CLUSTERING typo fallback fails closed to complete-link (#3331)

DIGEST_DEDUP_CLUSTERING previously fell to 'single' on unrecognised
values, which silently defeated the documented kill switch. A typo
like `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident
would stick with the aggressive single-link merger instead of rolling
back to the conservative complete-link algorithm.

Mirror the DIGEST_DEDUP_MODE typo pattern (PR #3247):
  - Unrecognised value → fall to 'complete' (SAFE / conservative).
  - Surface the raw value via new `invalidClusteringRaw` config field.
  - Emit a warn line on the dedup orchestrator's entry path so operators
    see the typo alongside the kill-switch-took-effect message.

Valid values 'single' (default), 'complete', unset, empty, and any
case variation all behave unchanged. Only true typos change behaviour
— and the new behaviour is the kill-switch-safe one.

Tests: updated the existing case that codified the old behaviour plus
added coverage for (a) multiple typo variants falling to complete with
invalidClusteringRaw set, (b) case-insensitive valid values not
triggering the typo path, and (c) the orchestrator emitting the warn
line even on the jaccard-kill-switch codepath (since CLUSTERING intent
applies to both modes).

81/81 dedup tests pass.
This commit is contained in:
Elie Habib
2026-04-23 11:25:05 +04:00
committed by GitHub
parent 58589144a5
commit 1958b34f55
2 changed files with 76 additions and 12 deletions

View File

@@ -60,6 +60,7 @@ import { defaultRedisPipeline } from './_upstash-pipeline.mjs';
* topicGroupingEnabled: boolean, * topicGroupingEnabled: boolean,
* topicThreshold: number, * topicThreshold: number,
* invalidModeRaw: string | null, * invalidModeRaw: string | null,
* invalidClusteringRaw: string | null,
* }} * }}
*/ */
export function readOrchestratorConfig(env = process.env) { export function readOrchestratorConfig(env = process.env) {
@@ -82,16 +83,30 @@ export function readOrchestratorConfig(env = process.env) {
invalidModeRaw = modeRaw; invalidModeRaw = modeRaw;
} }
// DIGEST_DEDUP_CLUSTERING = 'single' (default) | 'complete'. // DIGEST_DEDUP_CLUSTERING = 'single' (default when unset) | 'complete'.
// Single-link chains wire variants that share a strong // Single-link chains wire variants that share a strong intermediate
// intermediate headline (calibrated F1 0.73 vs complete-link 0.53 // headline (calibrated F1 0.73 vs complete-link 0.53 on real brief
// on real brief output). Flip to 'complete' for instant kill // output). 'complete' is the documented kill switch for when single-
// switch if single-link ever over-merges in production. // link over-merges in production.
//
// Typo handling mirrors the MODE branch above: an unrecognised value
// falls to 'complete' (the SAFE / conservative algorithm), not back
// to 'single'. Rationale: if an operator is typing
// `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident,
// silently sticking with the aggressive merger defeats the kill
// switch. The invalidClusteringRaw warn surfaces the typo so it's
// fixed, but the fail-closed default protects the cron meanwhile.
const clusteringRaw = (env.DIGEST_DEDUP_CLUSTERING ?? '').toLowerCase(); const clusteringRaw = (env.DIGEST_DEDUP_CLUSTERING ?? '').toLowerCase();
const clustering = let clustering;
clusteringRaw === 'complete' ? 'complete' let invalidClusteringRaw = null;
: clusteringRaw === 'single' || clusteringRaw === '' ? 'single' if (clusteringRaw === '' || clusteringRaw === 'single') {
: 'single'; clustering = 'single';
} else if (clusteringRaw === 'complete') {
clustering = 'complete';
} else {
clustering = 'complete';
invalidClusteringRaw = clusteringRaw;
}
const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? ''); const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? '');
const cosineThreshold = const cosineThreshold =
@@ -123,6 +138,7 @@ export function readOrchestratorConfig(env = process.env) {
topicGroupingEnabled, topicGroupingEnabled,
topicThreshold, topicThreshold,
invalidModeRaw, invalidModeRaw,
invalidClusteringRaw,
}; };
} }
@@ -160,6 +176,12 @@ export async function deduplicateStories(stories, deps = {}) {
'falling back to jaccard (safe rollback path). Valid values: embed | jaccard.', 'falling back to jaccard (safe rollback path). Valid values: embed | jaccard.',
); );
} }
if (cfg.invalidClusteringRaw !== null) {
warn(
`[digest] dedup unrecognised DIGEST_DEDUP_CLUSTERING=${cfg.invalidClusteringRaw}` +
'falling back to complete-link (safe / conservative). Valid values: single | complete.',
);
}
if (!Array.isArray(stories) || stories.length === 0) { if (!Array.isArray(stories) || stories.length === 0) {
return { reps: [], embeddingByHash: new Map(), logSummary: '' }; return { reps: [], embeddingByHash: new Map(), logSummary: '' };

View File

@@ -685,10 +685,52 @@ describe('readOrchestratorConfig — DIGEST_DEDUP_CLUSTERING', () => {
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' }); const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
assert.equal(cfg.clustering, 'complete'); assert.equal(cfg.clustering, 'complete');
}); });
it('unrecognised values fall back to single (default, safe)', async () => { it('unrecognised values fall back to COMPLETE (fail-closed kill switch), surfaces invalidClusteringRaw', async () => {
// Mirrors the MODE typo contract: a typo like CLUSTERING=complet
// during an over-merge incident must NOT silently stick with the
// aggressive 'single' merger — that defeats the kill switch. Fall
// to the SAFE conservative algorithm ('complete') and surface the
// raw value so the typo is visible in logs.
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs'); const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'average' }); for (const raw of ['average', 'complet', 'SINGLE ', 'xyz']) {
assert.equal(cfg.clustering, 'single'); const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: raw });
assert.equal(cfg.clustering, 'complete', `raw=${JSON.stringify(raw)} should fall to complete`);
assert.equal(cfg.invalidClusteringRaw, raw.toLowerCase(), `raw=${JSON.stringify(raw)} should surface as invalidClusteringRaw`);
}
});
it('case-insensitive on valid values (single/SINGLE/Complete all work)', async () => {
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'SINGLE' }).clustering, 'single');
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'Complete' }).clustering, 'complete');
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' }).invalidClusteringRaw, null);
});
it('explicit "single" and unset produce invalidClusteringRaw=null', async () => {
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
for (const env of [{}, { DIGEST_DEDUP_CLUSTERING: 'single' }, { DIGEST_DEDUP_CLUSTERING: '' }]) {
const cfg = readOrchestratorConfig(env);
assert.equal(cfg.clustering, 'single');
assert.equal(cfg.invalidClusteringRaw, null);
}
});
it('explicit "complete" produces invalidClusteringRaw=null', async () => {
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
assert.equal(cfg.clustering, 'complete');
assert.equal(cfg.invalidClusteringRaw, null);
});
it('deduplicateStories emits warn line on CLUSTERING typo', async () => {
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
const warns = [];
await deduplicateStories([], {
env: { DIGEST_DEDUP_MODE: 'jaccard', DIGEST_DEDUP_CLUSTERING: 'complet' },
warn: (line) => warns.push(line),
});
// Even the jaccard-kill-switch path must surface the CLUSTERING typo
// since the operator intent (conservative path) is valid in both modes.
assert.ok(
warns.some((w) => /DIGEST_DEDUP_CLUSTERING=complet/.test(w) && /complete-link/.test(w)),
`expected typo warn; got: ${JSON.stringify(warns)}`,
);
}); });
it('structured logSummary includes clustering=<algo>', async () => { it('structured logSummary includes clustering=<algo>', async () => {
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs'); const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');