mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
fix(digest-dedup): CLUSTERING typo fallback fails closed to complete-link (#3331)
DIGEST_DEDUP_CLUSTERING previously fell to 'single' on unrecognised values, which silently defeated the documented kill switch. A typo like `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident would stick with the aggressive single-link merger instead of rolling back to the conservative complete-link algorithm. Mirror the DIGEST_DEDUP_MODE typo pattern (PR #3247): - Unrecognised value → fall to 'complete' (SAFE / conservative). - Surface the raw value via new `invalidClusteringRaw` config field. - Emit a warn line on the dedup orchestrator's entry path so operators see the typo alongside the kill-switch-took-effect message. Valid values 'single' (default), 'complete', unset, empty, and any case variation all behave unchanged. Only true typos change behaviour — and the new behaviour is the kill-switch-safe one. Tests: updated the existing case that codified the old behaviour plus added coverage for (a) multiple typo variants falling to complete with invalidClusteringRaw set, (b) case-insensitive valid values not triggering the typo path, and (c) the orchestrator emitting the warn line even on the jaccard-kill-switch codepath (since CLUSTERING intent applies to both modes). 81/81 dedup tests pass.
This commit is contained in:
@@ -60,6 +60,7 @@ import { defaultRedisPipeline } from './_upstash-pipeline.mjs';
|
||||
* topicGroupingEnabled: boolean,
|
||||
* topicThreshold: number,
|
||||
* invalidModeRaw: string | null,
|
||||
* invalidClusteringRaw: string | null,
|
||||
* }}
|
||||
*/
|
||||
export function readOrchestratorConfig(env = process.env) {
|
||||
@@ -82,16 +83,30 @@ export function readOrchestratorConfig(env = process.env) {
|
||||
invalidModeRaw = modeRaw;
|
||||
}
|
||||
|
||||
// DIGEST_DEDUP_CLUSTERING = 'single' (default) | 'complete'.
|
||||
// Single-link chains wire variants that share a strong
|
||||
// intermediate headline (calibrated F1 0.73 vs complete-link 0.53
|
||||
// on real brief output). Flip to 'complete' for instant kill
|
||||
// switch if single-link ever over-merges in production.
|
||||
// DIGEST_DEDUP_CLUSTERING = 'single' (default when unset) | 'complete'.
|
||||
// Single-link chains wire variants that share a strong intermediate
|
||||
// headline (calibrated F1 0.73 vs complete-link 0.53 on real brief
|
||||
// output). 'complete' is the documented kill switch for when single-
|
||||
// link over-merges in production.
|
||||
//
|
||||
// Typo handling mirrors the MODE branch above: an unrecognised value
|
||||
// falls to 'complete' (the SAFE / conservative algorithm), not back
|
||||
// to 'single'. Rationale: if an operator is typing
|
||||
// `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident,
|
||||
// silently sticking with the aggressive merger defeats the kill
|
||||
// switch. The invalidClusteringRaw warn surfaces the typo so it's
|
||||
// fixed, but the fail-closed default protects the cron meanwhile.
|
||||
const clusteringRaw = (env.DIGEST_DEDUP_CLUSTERING ?? '').toLowerCase();
|
||||
const clustering =
|
||||
clusteringRaw === 'complete' ? 'complete'
|
||||
: clusteringRaw === 'single' || clusteringRaw === '' ? 'single'
|
||||
: 'single';
|
||||
let clustering;
|
||||
let invalidClusteringRaw = null;
|
||||
if (clusteringRaw === '' || clusteringRaw === 'single') {
|
||||
clustering = 'single';
|
||||
} else if (clusteringRaw === 'complete') {
|
||||
clustering = 'complete';
|
||||
} else {
|
||||
clustering = 'complete';
|
||||
invalidClusteringRaw = clusteringRaw;
|
||||
}
|
||||
|
||||
const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? '');
|
||||
const cosineThreshold =
|
||||
@@ -123,6 +138,7 @@ export function readOrchestratorConfig(env = process.env) {
|
||||
topicGroupingEnabled,
|
||||
topicThreshold,
|
||||
invalidModeRaw,
|
||||
invalidClusteringRaw,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -160,6 +176,12 @@ export async function deduplicateStories(stories, deps = {}) {
|
||||
'falling back to jaccard (safe rollback path). Valid values: embed | jaccard.',
|
||||
);
|
||||
}
|
||||
if (cfg.invalidClusteringRaw !== null) {
|
||||
warn(
|
||||
`[digest] dedup unrecognised DIGEST_DEDUP_CLUSTERING=${cfg.invalidClusteringRaw} — ` +
|
||||
'falling back to complete-link (safe / conservative). Valid values: single | complete.',
|
||||
);
|
||||
}
|
||||
|
||||
if (!Array.isArray(stories) || stories.length === 0) {
|
||||
return { reps: [], embeddingByHash: new Map(), logSummary: '' };
|
||||
|
||||
@@ -685,10 +685,52 @@ describe('readOrchestratorConfig — DIGEST_DEDUP_CLUSTERING', () => {
|
||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
|
||||
assert.equal(cfg.clustering, 'complete');
|
||||
});
|
||||
it('unrecognised values fall back to single (default, safe)', async () => {
|
||||
it('unrecognised values fall back to COMPLETE (fail-closed kill switch), surfaces invalidClusteringRaw', async () => {
|
||||
// Mirrors the MODE typo contract: a typo like CLUSTERING=complet
|
||||
// during an over-merge incident must NOT silently stick with the
|
||||
// aggressive 'single' merger — that defeats the kill switch. Fall
|
||||
// to the SAFE conservative algorithm ('complete') and surface the
|
||||
// raw value so the typo is visible in logs.
|
||||
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'average' });
|
||||
assert.equal(cfg.clustering, 'single');
|
||||
for (const raw of ['average', 'complet', 'SINGLE ', 'xyz']) {
|
||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: raw });
|
||||
assert.equal(cfg.clustering, 'complete', `raw=${JSON.stringify(raw)} should fall to complete`);
|
||||
assert.equal(cfg.invalidClusteringRaw, raw.toLowerCase(), `raw=${JSON.stringify(raw)} should surface as invalidClusteringRaw`);
|
||||
}
|
||||
});
|
||||
it('case-insensitive on valid values (single/SINGLE/Complete all work)', async () => {
|
||||
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'SINGLE' }).clustering, 'single');
|
||||
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'Complete' }).clustering, 'complete');
|
||||
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' }).invalidClusteringRaw, null);
|
||||
});
|
||||
it('explicit "single" and unset produce invalidClusteringRaw=null', async () => {
|
||||
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
for (const env of [{}, { DIGEST_DEDUP_CLUSTERING: 'single' }, { DIGEST_DEDUP_CLUSTERING: '' }]) {
|
||||
const cfg = readOrchestratorConfig(env);
|
||||
assert.equal(cfg.clustering, 'single');
|
||||
assert.equal(cfg.invalidClusteringRaw, null);
|
||||
}
|
||||
});
|
||||
it('explicit "complete" produces invalidClusteringRaw=null', async () => {
|
||||
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
|
||||
assert.equal(cfg.clustering, 'complete');
|
||||
assert.equal(cfg.invalidClusteringRaw, null);
|
||||
});
|
||||
it('deduplicateStories emits warn line on CLUSTERING typo', async () => {
|
||||
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
const warns = [];
|
||||
await deduplicateStories([], {
|
||||
env: { DIGEST_DEDUP_MODE: 'jaccard', DIGEST_DEDUP_CLUSTERING: 'complet' },
|
||||
warn: (line) => warns.push(line),
|
||||
});
|
||||
// Even the jaccard-kill-switch path must surface the CLUSTERING typo
|
||||
// since the operator intent (conservative path) is valid in both modes.
|
||||
assert.ok(
|
||||
warns.some((w) => /DIGEST_DEDUP_CLUSTERING=complet/.test(w) && /complete-link/.test(w)),
|
||||
`expected typo warn; got: ${JSON.stringify(warns)}`,
|
||||
);
|
||||
});
|
||||
it('structured logSummary includes clustering=<algo>', async () => {
|
||||
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
|
||||
|
||||
Reference in New Issue
Block a user