mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
fix(digest-dedup): CLUSTERING typo fallback fails closed to complete-link (#3331)
DIGEST_DEDUP_CLUSTERING previously fell to 'single' on unrecognised values, which silently defeated the documented kill switch. A typo like `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident would stick with the aggressive single-link merger instead of rolling back to the conservative complete-link algorithm. Mirror the DIGEST_DEDUP_MODE typo pattern (PR #3247): - Unrecognised value → fall to 'complete' (SAFE / conservative). - Surface the raw value via new `invalidClusteringRaw` config field. - Emit a warn line on the dedup orchestrator's entry path so operators see the typo alongside the kill-switch-took-effect message. Valid values 'single' (default), 'complete', unset, empty, and any case variation all behave unchanged. Only true typos change behaviour — and the new behaviour is the kill-switch-safe one. Tests: updated the existing case that codified the old behaviour plus added coverage for (a) multiple typo variants falling to complete with invalidClusteringRaw set, (b) case-insensitive valid values not triggering the typo path, and (c) the orchestrator emitting the warn line even on the jaccard-kill-switch codepath (since CLUSTERING intent applies to both modes). 81/81 dedup tests pass.
This commit is contained in:
@@ -60,6 +60,7 @@ import { defaultRedisPipeline } from './_upstash-pipeline.mjs';
|
|||||||
* topicGroupingEnabled: boolean,
|
* topicGroupingEnabled: boolean,
|
||||||
* topicThreshold: number,
|
* topicThreshold: number,
|
||||||
* invalidModeRaw: string | null,
|
* invalidModeRaw: string | null,
|
||||||
|
* invalidClusteringRaw: string | null,
|
||||||
* }}
|
* }}
|
||||||
*/
|
*/
|
||||||
export function readOrchestratorConfig(env = process.env) {
|
export function readOrchestratorConfig(env = process.env) {
|
||||||
@@ -82,16 +83,30 @@ export function readOrchestratorConfig(env = process.env) {
|
|||||||
invalidModeRaw = modeRaw;
|
invalidModeRaw = modeRaw;
|
||||||
}
|
}
|
||||||
|
|
||||||
// DIGEST_DEDUP_CLUSTERING = 'single' (default) | 'complete'.
|
// DIGEST_DEDUP_CLUSTERING = 'single' (default when unset) | 'complete'.
|
||||||
// Single-link chains wire variants that share a strong
|
// Single-link chains wire variants that share a strong intermediate
|
||||||
// intermediate headline (calibrated F1 0.73 vs complete-link 0.53
|
// headline (calibrated F1 0.73 vs complete-link 0.53 on real brief
|
||||||
// on real brief output). Flip to 'complete' for instant kill
|
// output). 'complete' is the documented kill switch for when single-
|
||||||
// switch if single-link ever over-merges in production.
|
// link over-merges in production.
|
||||||
|
//
|
||||||
|
// Typo handling mirrors the MODE branch above: an unrecognised value
|
||||||
|
// falls to 'complete' (the SAFE / conservative algorithm), not back
|
||||||
|
// to 'single'. Rationale: if an operator is typing
|
||||||
|
// `DIGEST_DEDUP_CLUSTERING=complet` during an over-merge incident,
|
||||||
|
// silently sticking with the aggressive merger defeats the kill
|
||||||
|
// switch. The invalidClusteringRaw warn surfaces the typo so it's
|
||||||
|
// fixed, but the fail-closed default protects the cron meanwhile.
|
||||||
const clusteringRaw = (env.DIGEST_DEDUP_CLUSTERING ?? '').toLowerCase();
|
const clusteringRaw = (env.DIGEST_DEDUP_CLUSTERING ?? '').toLowerCase();
|
||||||
const clustering =
|
let clustering;
|
||||||
clusteringRaw === 'complete' ? 'complete'
|
let invalidClusteringRaw = null;
|
||||||
: clusteringRaw === 'single' || clusteringRaw === '' ? 'single'
|
if (clusteringRaw === '' || clusteringRaw === 'single') {
|
||||||
: 'single';
|
clustering = 'single';
|
||||||
|
} else if (clusteringRaw === 'complete') {
|
||||||
|
clustering = 'complete';
|
||||||
|
} else {
|
||||||
|
clustering = 'complete';
|
||||||
|
invalidClusteringRaw = clusteringRaw;
|
||||||
|
}
|
||||||
|
|
||||||
const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? '');
|
const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? '');
|
||||||
const cosineThreshold =
|
const cosineThreshold =
|
||||||
@@ -123,6 +138,7 @@ export function readOrchestratorConfig(env = process.env) {
|
|||||||
topicGroupingEnabled,
|
topicGroupingEnabled,
|
||||||
topicThreshold,
|
topicThreshold,
|
||||||
invalidModeRaw,
|
invalidModeRaw,
|
||||||
|
invalidClusteringRaw,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -160,6 +176,12 @@ export async function deduplicateStories(stories, deps = {}) {
|
|||||||
'falling back to jaccard (safe rollback path). Valid values: embed | jaccard.',
|
'falling back to jaccard (safe rollback path). Valid values: embed | jaccard.',
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (cfg.invalidClusteringRaw !== null) {
|
||||||
|
warn(
|
||||||
|
`[digest] dedup unrecognised DIGEST_DEDUP_CLUSTERING=${cfg.invalidClusteringRaw} — ` +
|
||||||
|
'falling back to complete-link (safe / conservative). Valid values: single | complete.',
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if (!Array.isArray(stories) || stories.length === 0) {
|
if (!Array.isArray(stories) || stories.length === 0) {
|
||||||
return { reps: [], embeddingByHash: new Map(), logSummary: '' };
|
return { reps: [], embeddingByHash: new Map(), logSummary: '' };
|
||||||
|
|||||||
@@ -685,10 +685,52 @@ describe('readOrchestratorConfig — DIGEST_DEDUP_CLUSTERING', () => {
|
|||||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
|
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
|
||||||
assert.equal(cfg.clustering, 'complete');
|
assert.equal(cfg.clustering, 'complete');
|
||||||
});
|
});
|
||||||
it('unrecognised values fall back to single (default, safe)', async () => {
|
it('unrecognised values fall back to COMPLETE (fail-closed kill switch), surfaces invalidClusteringRaw', async () => {
|
||||||
|
// Mirrors the MODE typo contract: a typo like CLUSTERING=complet
|
||||||
|
// during an over-merge incident must NOT silently stick with the
|
||||||
|
// aggressive 'single' merger — that defeats the kill switch. Fall
|
||||||
|
// to the SAFE conservative algorithm ('complete') and surface the
|
||||||
|
// raw value so the typo is visible in logs.
|
||||||
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'average' });
|
for (const raw of ['average', 'complet', 'SINGLE ', 'xyz']) {
|
||||||
assert.equal(cfg.clustering, 'single');
|
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: raw });
|
||||||
|
assert.equal(cfg.clustering, 'complete', `raw=${JSON.stringify(raw)} should fall to complete`);
|
||||||
|
assert.equal(cfg.invalidClusteringRaw, raw.toLowerCase(), `raw=${JSON.stringify(raw)} should surface as invalidClusteringRaw`);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
it('case-insensitive on valid values (single/SINGLE/Complete all work)', async () => {
|
||||||
|
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
|
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'SINGLE' }).clustering, 'single');
|
||||||
|
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'Complete' }).clustering, 'complete');
|
||||||
|
assert.equal(readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' }).invalidClusteringRaw, null);
|
||||||
|
});
|
||||||
|
it('explicit "single" and unset produce invalidClusteringRaw=null', async () => {
|
||||||
|
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
|
for (const env of [{}, { DIGEST_DEDUP_CLUSTERING: 'single' }, { DIGEST_DEDUP_CLUSTERING: '' }]) {
|
||||||
|
const cfg = readOrchestratorConfig(env);
|
||||||
|
assert.equal(cfg.clustering, 'single');
|
||||||
|
assert.equal(cfg.invalidClusteringRaw, null);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
it('explicit "complete" produces invalidClusteringRaw=null', async () => {
|
||||||
|
const { readOrchestratorConfig } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
|
const cfg = readOrchestratorConfig({ DIGEST_DEDUP_CLUSTERING: 'complete' });
|
||||||
|
assert.equal(cfg.clustering, 'complete');
|
||||||
|
assert.equal(cfg.invalidClusteringRaw, null);
|
||||||
|
});
|
||||||
|
it('deduplicateStories emits warn line on CLUSTERING typo', async () => {
|
||||||
|
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
|
const warns = [];
|
||||||
|
await deduplicateStories([], {
|
||||||
|
env: { DIGEST_DEDUP_MODE: 'jaccard', DIGEST_DEDUP_CLUSTERING: 'complet' },
|
||||||
|
warn: (line) => warns.push(line),
|
||||||
|
});
|
||||||
|
// Even the jaccard-kill-switch path must surface the CLUSTERING typo
|
||||||
|
// since the operator intent (conservative path) is valid in both modes.
|
||||||
|
assert.ok(
|
||||||
|
warns.some((w) => /DIGEST_DEDUP_CLUSTERING=complet/.test(w) && /complete-link/.test(w)),
|
||||||
|
`expected typo warn; got: ${JSON.stringify(warns)}`,
|
||||||
|
);
|
||||||
});
|
});
|
||||||
it('structured logSummary includes clustering=<algo>', async () => {
|
it('structured logSummary includes clustering=<algo>', async () => {
|
||||||
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
|
const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs');
|
||||||
|
|||||||
Reference in New Issue
Block a user