// ---- file: scripts/lib/_upstash-pipeline.mjs (new) ----

/**
 * Shared Upstash pipeline helper for scripts/lib/* modules.
 *
 * The canonical helper for api/* code is api/_upstash-json.js:redisPipeline.
 * scripts/lib/* modules historically avoid importing from api/, so this
 * file exposes the same behaviour (single POST to /pipeline, 10s timeout,
 * returns null on failure) without the cross-dir import.
 *
 * Keep the shape identical to api/_upstash-json.js:redisPipeline so
 * callers can be swapped if we ever relax the boundary.
 */

/**
 * Execute a batch of Upstash Redis commands with a single REST call.
 *
 * NOTE(review): generic annotations below were reconstructed after the
 * source text lost its angle-bracket spans — confirm against the PR.
 *
 * @param {Array<Array<string>>} commands Upstash pipeline commands
 * @param {object} [opts]
 * @param {number} [opts.timeoutMs=10_000]
 * @returns {Promise<Array<unknown> | null>} null on any failure
 */
export async function defaultRedisPipeline(commands, { timeoutMs = 10_000 } = {}) {
  const url = process.env.UPSTASH_REDIS_REST_URL;
  const token = process.env.UPSTASH_REDIS_REST_TOKEN;
  // Missing config or an empty batch is treated as "nothing to do",
  // not an error — callers uniformly handle the null.
  if (!url || !token || commands.length === 0) return null;
  try {
    const resp = await fetch(`${url}/pipeline`, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${token}`,
        'Content-Type': 'application/json',
        'User-Agent': 'worldmonitor-digest/1.0',
      },
      body: JSON.stringify(commands),
      signal: AbortSignal.timeout(timeoutMs),
    });
    if (!resp.ok) return null;
    return await resp.json();
  } catch {
    // Network error, abort, or JSON parse failure — all collapse to
    // null by contract; the caller decides whether that is fatal.
    return null;
  }
}

// ---- file: scripts/lib/brief-dedup-consts.mjs (new) ----

/**
 * Tunables for brief-dedup (Jaccard legacy + embedding replacement).
 *
 * Env-driven helpers are exported as functions so the orchestrator
 * reads them at call time, not at module load — Railway env-var flips
 * must take effect without a redeploy.
 *
 * See docs/plans/2026-04-19-001-feat-embedding-based-story-dedup-plan.md.
 */

// ── Jaccard (legacy path, kept as permanent fallback) ───────────────────
// Preserves origin/main behaviour byte-for-byte under MODE=jaccard.
// Threshold 0.55 matches the production implementation prior to this PR.
export const JACCARD_MERGE_THRESHOLD = 0.55;

// ── Embedding / complete-link clustering ────────────────────────────────
export const EMBED_MODEL = 'openai/text-embedding-3-small';
export const EMBED_DIMS = 512;

// Cache key prefix — version segment MUST bump on model or dimension
// change. Silent threshold drift on model upgrade is the documented
// #1 production regression; don't rely on TTL expiry to drain stale
// vectors.
export const CACHE_VERSION = 'v1:text-3-small-512';
export const CACHE_KEY_PREFIX = `brief:emb:${CACHE_VERSION}`;
export const CACHE_TTL_SECONDS = 14 * 24 * 60 * 60; // 14 days

// OpenRouter embeddings endpoint (OpenAI-compatible passthrough).
export const OPENROUTER_EMBEDDINGS_URL = 'https://openrouter.ai/api/v1/embeddings';

// Env-driven runtime knobs live in brief-dedup.mjs:readOrchestratorConfig
// — a single source of truth, read at call entry so Railway env flips
// take effect on the next tick. An earlier version exported getter
// helpers here too; they had zero callers and were deleted.

// An earlier iteration exposed an `__constants` bag so tests could
// assert against tunables in one deepEqual. Once the regex-extraction
// harness was removed, named imports became cleaner — the bag got
// deleted. If you need to assert a constant, import it directly.

/* ---- file: scripts/lib/brief-dedup-embed.mjs (new) ----
 * Pure clustering + entity-veto logic for the embedding dedup path.
 * (module header continues in the next chunk)
 */
/* (brief-dedup-embed.mjs module header, continued)
 *
 * This module is intentionally pure and side-effect free:
 *   - No Redis.
 *   - No fetch.
 *   - No env lookups (orchestrator reads env and passes thresholds in).
 *
 * The orchestrator in brief-dedup.mjs wires these helpers to the
 * embedding client and the legacy Jaccard fallback.
 */

import { cosineSimilarity } from './brief-embedding.mjs';
import {
  COMMON_CAPITALIZED,
  LOCATION_GAZETTEER,
} from './entity-gazetteer.mjs';

// ── Entity extraction / veto ───────────────────────────────────────────

// Matches "proper-noun-like" tokens: leading capital, then at least one
// more letter / hyphen / apostrophe / dot (so "I" and "A" never match).
const CAPITALIZED_TOKEN_RE = /^[A-Z][a-zA-Z\-'.]{1,}$/;

// Longest multi-word entry in the gazetteer (e.g. "ho chi minh city"
// is 4 tokens). Precomputed once; the sliding window in
// extractEntities never tries phrases longer than this, so the cost
// stays O(N * MAX_PHRASE_LEN) rather than O(N²).
const MAX_LOCATION_PHRASE_LEN = (() => {
  let max = 1;
  for (const entry of LOCATION_GAZETTEER) {
    const len = entry.split(/\s+/).length;
    if (len > max) max = len;
  }
  return max;
})();

// Strip trailing then leading punctuation from a whitespace token.
function cleanToken(t) {
  return t.replace(/[.,;:!?"')\]]+$/g, '').replace(/^["'([]+/g, '');
}

/**
 * Pull proper-noun-like entities from a headline and classify them
 * against the gazetteer.
 *
 * Locations are matched as **whole phrases** — single tokens like
 * "Tokyo" AND multi-token phrases like "Red Sea", "Strait of Hormuz",
 * "New York", "Abu Dhabi" all work. An earlier version tokenized on
 * whitespace and only checked single tokens, which silently made
 * ~30% of the gazetteer unreachable (bodies of water, regions,
 * compound city names). That turned off the veto for a whole class
 * of real headlines — hence the sliding-window greedy match below.
 *
 * Rules:
 *   1. Tokenize on whitespace, strip surrounding punctuation.
 *   2. Greedy match: at each position, try the longest multi-word
 *      location phrase first, down to 2 tokens. A phrase matches
 *      only when its first AND last tokens are capitalized (so
 *      "the middle east" in lowercase prose doesn't match, but
 *      "Middle East" in a headline does). Lowercase connectors
 *      like "of" / "and" may appear between them.
 *   3. If no multi-word match: fall back to single-token lookup.
 *      Capitalized + not in COMMON_CAPITALIZED → Location if in
 *      gazetteer, Actor otherwise.
 *
 * Sentence-start tokens are intentionally kept — news headlines
 * front-load the anchor entity ("Iran...", "Trump...").
 *
 * @param {string} title
 * @returns {{ locations: string[], actors: string[] }}
 */
export function extractEntities(title) {
  if (typeof title !== 'string' || title.length === 0) {
    return { locations: [], actors: [] };
  }
  const tokens = title.split(/\s+/).map(cleanToken).filter(Boolean);

  const locations = new Set();
  const actors = new Set();
  let i = 0;
  while (i < tokens.length) {
    // Greedy longest-phrase scan for multi-word locations.
    let matchedLen = 0;
    const maxTry = Math.min(MAX_LOCATION_PHRASE_LEN, tokens.length - i);
    for (let L = maxTry; L >= 2; L--) {
      const first = tokens[i];
      const last = tokens[i + L - 1];
      if (!CAPITALIZED_TOKEN_RE.test(first) || !CAPITALIZED_TOKEN_RE.test(last)) {
        continue;
      }
      const phrase = tokens.slice(i, i + L).join(' ').toLowerCase();
      if (LOCATION_GAZETTEER.has(phrase)) {
        locations.add(phrase);
        matchedLen = L;
        break;
      }
    }
    if (matchedLen > 0) {
      i += matchedLen;
      continue;
    }
    // Single-token classification.
    const tok = tokens[i];
    if (CAPITALIZED_TOKEN_RE.test(tok)) {
      const lower = tok.toLowerCase();
      if (!COMMON_CAPITALIZED.has(lower)) {
        if (LOCATION_GAZETTEER.has(lower)) {
          locations.add(lower);
        } else {
          actors.add(lower);
        }
      }
    }
    i += 1;
  }
  return {
    locations: [...locations],
    actors: [...actors],
  };
}

/**
 * Pairwise merge-veto.
 *
 * Fires when two titles share at least one location AND each side
 * has at least one actor the other doesn't — "same venue, different
 * protagonists" (canonical case: "Biden meets Xi in Tokyo" vs
 * "Biden meets Putin in Tokyo").
 *
 * Empty proper-noun sets on either side → defer to cosine (return false).
 *
 * @param {string} titleA
 * @param {string} titleB
 * @returns {boolean}
 */
export function shouldVeto(titleA, titleB) {
  const a = extractEntities(titleA);
  const b = extractEntities(titleB);

  if (a.actors.length === 0 && b.actors.length === 0) return false;

  const bLocSet = new Set(b.locations);
  const sharedLocation = a.locations.some((loc) => bLocSet.has(loc));
  if (!sharedLocation) return false;

  const aActorSet = new Set(a.actors);
  const bActorSet = new Set(b.actors);
  const aHasUnique = a.actors.some((act) => !bActorSet.has(act));
  const bHasUnique = b.actors.some((act) => !aActorSet.has(act));
  return aHasUnique && bHasUnique;
}

// ── Complete-link clustering ───────────────────────────────────────────

/**
 * Greedy first-fit complete-link clustering.
 *
 * Admission rule: a candidate joins an existing cluster ONLY IF, for
 * every member already in that cluster:
 *   1. cosine(candidate.embedding, member.embedding) >= cosineThreshold
 *   2. vetoFn(candidate, member) === false (if vetoFn provided)
 *
 * Single-link would admit C into {A,B} as long as C~B clears the bar,
 * even if cosine(A,C) is low — the transitive chaining that re-
 * created the bridge-pollution failure mode on the Jaccard side. We
 * do NOT want that.
 *
 * Input items MUST be pre-sorted by the caller (the orchestrator in
 * brief-dedup.mjs sorts by [currentScore DESC, sha256(title) ASC]).
 * Changing input order changes cluster composition; the orchestrator
 * owns the determinism contract.
 *
 * @param {Array<{title:string, embedding:number[]}>} items
 * @param {object} opts
 * @param {number} opts.cosineThreshold
 * @param {((a: {title:string}, b: {title:string}) => boolean) | null} [opts.vetoFn]
 * @returns {{ clusters: number[][], vetoFires: number }}
 */
export function completeLinkCluster(items, { cosineThreshold, vetoFn = null }) {
  if (!Array.isArray(items)) {
    return { clusters: [], vetoFires: 0 };
  }

  const clusters = [];
  let vetoFires = 0;

  for (let i = 0; i < items.length; i++) {
    const candidate = items[i];
    if (!candidate || !Array.isArray(candidate.embedding)) {
      // Defensive: if an item somehow lacks an embedding, it goes in
      // its own cluster rather than poisoning the whole batch.
      clusters.push([i]);
      continue;
    }

    let joined = false;
    for (const cluster of clusters) {
      let admissible = true;
      for (const j of cluster) {
        const member = items[j];
        const cos = cosineSimilarity(candidate.embedding, member.embedding);
        if (cos < cosineThreshold) {
          admissible = false;
          break;
        }
        if (vetoFn?.(candidate, member)) {
          admissible = false;
          vetoFires += 1;
          break;
        }
      }
      if (admissible) {
        cluster.push(i);
        joined = true;
        break;
      }
    }
    if (!joined) clusters.push([i]);
  }

  return { clusters, vetoFires };
}

/* ---- file: scripts/lib/brief-dedup-jaccard.mjs (new) ----
 * Bag-of-words Jaccard dedup — extracted verbatim from the earlier
 * inline implementation in scripts/seed-digest-notifications.mjs so
 * the embedding orchestrator can fall back to the exact historical
 * behaviour on any failure (provider outage, wall-clock overrun,
 * REMOTE_EMBED_ENABLED=0, MODE=jaccard).
 *
 * DO NOT tune the threshold here.
 * (module header continues in the next chunk)
 */
/* (brief-dedup-jaccard.mjs module header, continued)
 * If embedding accuracy is still
 * short of the flip criterion at the end of the shadow window, fix
 * calibration or the cosine threshold — not this fallback. This
 * function's contract is "whatever production did before the
 * embedding path landed".
 */

// ── Stop-word set ──────────────────────────────────────────────────────
// Pruned list of highly-common tokens that dominate Jaccard numerators
// without carrying topical signal. Extracted unchanged from the
// pre-embedding seed-digest-notifications.mjs.
const STOP_WORDS = new Set([
  'the', 'a', 'an', 'in', 'on', 'at', 'to', 'for', 'of', 'is', 'are', 'was', 'were',
  'has', 'have', 'had', 'be', 'been', 'by', 'from', 'with', 'as', 'it', 'its',
  'says', 'say', 'said', 'according', 'reports', 'report', 'officials', 'official',
  'us', 'new', 'will', 'can', 'could', 'would', 'may', 'also', 'who', 'that', 'this',
  'after', 'about', 'over', 'more', 'up', 'out', 'into', 'than', 'some', 'other',
]);

/**
 * Strip wire-service attribution suffixes like " - Reuters" /
 * " | AP News" / " - reuters.com" so headlines from the same event
 * are comparable across outlets.
 */
export function stripSourceSuffix(title) {
  return title
    .replace(/\s*[-–—]\s*[\w\s.]+\.(?:com|org|net|co\.uk)\s*$/i, '')
    .replace(/\s*[-–—]\s*(?:Reuters|AP News|BBC|CNN|Al Jazeera|France 24|DW News|PBS NewsHour|CBS News|NBC|ABC|Associated Press|The Guardian|NOS Nieuws|Tagesschau|CNBC|The National)\s*$/i, '');
}

/**
 * Tokenise a headline into a lower-cased Set of content words, with
 * stop-words and 1–2 char tokens dropped. The Set shape is what the
 * Jaccard function expects.
 */
export function extractTitleWords(title) {
  return new Set(
    stripSourceSuffix(title)
      .toLowerCase()
      .replace(/[^\p{L}\p{N}\s]/gu, '')
      .split(/\s+/)
      .filter((w) => w.length > 2 && !STOP_WORDS.has(w)),
  );
}

/**
 * Classic Jaccard coefficient on two Sets. |A∩B| / |A∪B|. Returns 0
 * when either Set is empty (no arithmetic surprise on Set(0)).
 */
export function jaccardSimilarity(setA, setB) {
  if (setA.size === 0 || setB.size === 0) return 0;
  let intersection = 0;
  for (const w of setA) if (setB.has(w)) intersection++;
  return intersection / (setA.size + setB.size - intersection);
}

/**
 * Representative-selection + mentionCount-sum + mergedHashes contract
 * that composeBriefFromDigestStories / sources-population rely on.
 *
 * Shared helper so the orchestrator's embed path and the Jaccard
 * fallback apply identical semantics — drift here silently breaks
 * downstream. Accepts an array of story refs (already a single
 * cluster) and returns one story object.
 *
 * @param {Array<{hash:string, currentScore:number, mentionCount:number}>} items
 */
export function materializeCluster(items) {
  const sorted = [...items].sort(
    (a, b) => b.currentScore - a.currentScore || b.mentionCount - a.mentionCount,
  );
  const best = { ...sorted[0] };
  if (sorted.length > 1) {
    best.mentionCount = sorted.reduce((sum, s) => sum + s.mentionCount, 0);
  }
  best.mergedHashes = sorted.map((s) => s.hash);
  return best;
}

/**
 * Greedy single-link clustering by Jaccard > 0.55. Preserves the
 * representative-selection + mentionCount-sum + mergedHashes contract
 * that composeBriefFromDigestStories / sources-population rely on.
 *
 * Threshold is a hard-coded literal (not env-tunable) on purpose —
 * this is the permanent fallback. If the number needs to change,
 * the right answer is to flip the caller to MODE=embed with a
 * properly-calibrated cosine threshold, not to fiddle with Jaccard.
 *
 * @param {Array<{title:string, currentScore:number, mentionCount:number, hash:string}>} stories
 */
export function deduplicateStoriesJaccard(stories) {
  const clusters = [];
  for (const story of stories) {
    const words = extractTitleWords(story.title);
    let merged = false;
    for (const cluster of clusters) {
      // NOTE: candidate is compared against the cluster's FIRST
      // member's word set only — that is the historical single-link
      // behaviour this fallback must preserve.
      if (jaccardSimilarity(words, cluster.words) > 0.55) {
        cluster.items.push(story);
        merged = true;
        break;
      }
    }
    if (!merged) clusters.push({ words, items: [story] });
  }
  return clusters.map(({ items }) => materializeCluster(items));
}

/* ---- file: scripts/lib/brief-dedup.mjs (new) ----
 * Dedup orchestrator — the single entry point the digest cron calls
 * to cluster its story list.
 *
 * Public: deduplicateStories(stories, deps?) returns the same shape
 * the earlier inline Jaccard produced:
 *   [{ ...representativeStoryFields, mentionCount, mergedHashes }, ...]
 *
 * Env knobs (read at call entry — Railway env flips take effect on
 * the next cron tick without a redeploy):
 *   DIGEST_DEDUP_MODE = 'embed' (default) | 'jaccard'
 *     (jaccard = instant kill switch)
 *   DIGEST_DEDUP_ENTITY_VETO_ENABLED = '0' to bypass the actor/
 *     location veto; default on
 *   DIGEST_DEDUP_COSINE_THRESHOLD = float in (0, 1], default 0.60
 *   DIGEST_DEDUP_WALL_CLOCK_MS = int ms, default 45000
 *
 * Anything non-{embed,jaccard} in MODE = jaccard with a loud warn so
 * a typo can't stay hidden.
 *
 * All-or-nothing fallback: if the embed path throws for any reason
 * (provider outage, timeout, missing API key, malformed response),
 * the orchestrator falls back to Jaccard for the entire batch and
 * emits a warn with `reason=<name>`. The cron NEVER fails
 * because embeddings flaked.
 * (module header ends in the next chunk)
 */
+ */ + +import { createHash } from 'node:crypto'; + +import { + deduplicateStoriesJaccard, + materializeCluster, + stripSourceSuffix, +} from './brief-dedup-jaccard.mjs'; +import { + completeLinkCluster, + shouldVeto, +} from './brief-dedup-embed.mjs'; +import { + embedBatch, + normalizeForEmbedding, +} from './brief-embedding.mjs'; +import { defaultRedisPipeline } from './_upstash-pipeline.mjs'; + +// ── Config resolution (env read at call entry) ───────────────────────── + +/** + * @param {Record} [env] + * @returns {{ + * mode: 'jaccard' | 'embed', + * entityVetoEnabled: boolean, + * cosineThreshold: number, + * wallClockMs: number, + * invalidModeRaw: string | null, + * }} + */ +export function readOrchestratorConfig(env = process.env) { + const modeRaw = (env.DIGEST_DEDUP_MODE ?? '').toLowerCase(); + let mode; + let invalidModeRaw = null; + if (modeRaw === '' || modeRaw === 'embed') { + mode = 'embed'; + } else if (modeRaw === 'jaccard') { + mode = 'jaccard'; + } else { + // Unrecognised value — default to embed (the normal prod path) + // but surface so a DIGEST_DEDUP_MODE=embbed typo is obvious. + mode = 'embed'; + invalidModeRaw = modeRaw; + } + + const cosineRaw = Number.parseFloat(env.DIGEST_DEDUP_COSINE_THRESHOLD ?? ''); + const cosineThreshold = + Number.isFinite(cosineRaw) && cosineRaw > 0 && cosineRaw <= 1 ? cosineRaw : 0.60; + + const wallClockRaw = Number.parseInt(env.DIGEST_DEDUP_WALL_CLOCK_MS ?? '', 10); + const wallClockMs = + Number.isInteger(wallClockRaw) && wallClockRaw > 0 ? 
wallClockRaw : 45_000; + + return { + mode, + entityVetoEnabled: env.DIGEST_DEDUP_ENTITY_VETO_ENABLED !== '0', + cosineThreshold, + wallClockMs, + invalidModeRaw, + }; +} + +// ── Helpers ──────────────────────────────────────────────────────────── + +function titleHashHex(normalizedTitle) { + return createHash('sha256').update(normalizedTitle).digest('hex'); +} + +// ── Public entry point ───────────────────────────────────────────────── + +/** + * @param {Array<{hash:string, title:string, currentScore:number, mentionCount:number}>} stories + * @param {object} [deps] + * @param {Record} [deps.env] + * @param {typeof embedBatch} [deps.embedBatch] + * @param {typeof deduplicateStoriesJaccard} [deps.jaccard] + * @param {typeof defaultRedisPipeline} [deps.redisPipeline] + * @param {() => number} [deps.now] + * @param {(line: string) => void} [deps.log] + * @param {(line: string) => void} [deps.warn] + */ +export async function deduplicateStories(stories, deps = {}) { + const cfg = readOrchestratorConfig(deps.env ?? process.env); + const jaccard = deps.jaccard ?? deduplicateStoriesJaccard; + const log = deps.log ?? ((line) => console.log(line)); + const warn = deps.warn ?? ((line) => console.warn(line)); + + if (cfg.invalidModeRaw !== null) { + warn( + `[digest] dedup unrecognised DIGEST_DEDUP_MODE=${cfg.invalidModeRaw} — ` + + 'defaulting to embed. Valid values: embed | jaccard.', + ); + } + + if (!Array.isArray(stories) || stories.length === 0) return []; + + // Kill switch: Railway operator sets MODE=jaccard to instantly + // revert to the legacy deduper without a redeploy. + if (cfg.mode === 'jaccard') { + return jaccard(stories); + } + + const embedImpl = deps.embedBatch ?? embedBatch; + const pipelineImpl = deps.redisPipeline ?? defaultRedisPipeline; + const nowImpl = deps.now ?? 
(() => Date.now()); + const started = nowImpl(); + + try { + // Normalize + deterministic pre-sort so greedy first-fit is + // permutation-invariant (property-tested in the embed test file). + const prepared = stories.map((story, originalIndex) => { + const normalizedTitle = normalizeForEmbedding(story.title); + // `title` here is used as the veto input — must be case- + // preserving (extractEntities looks at capitalised tokens) + // but MUST NOT carry wire-source suffixes (" - Reuters" etc.) + // that would otherwise leak into the actor set and fire the + // veto on two copies of the same event from different outlets. + const vetoTitle = stripSourceSuffix(story.title); + return { + story, + originalIndex, + hash: story.hash, + title: vetoTitle, + normalizedTitle, + titleHashHex: titleHashHex(normalizedTitle), + currentScore: Number(story.currentScore ?? 0), + mentionCount: Number(story.mentionCount ?? 1), + }; + }); + prepared.sort( + (a, b) => + b.currentScore - a.currentScore || + (a.titleHashHex < b.titleHashHex ? -1 : a.titleHashHex > b.titleHashHex ? 1 : 0), + ); + + const embeddings = await embedImpl( + prepared.map((p) => p.normalizedTitle), + { + redisPipeline: pipelineImpl, + wallClockMs: cfg.wallClockMs, + now: nowImpl, + }, + ); + if (!Array.isArray(embeddings) || embeddings.length !== prepared.length) { + throw new Error('embedBatch returned unexpected result'); + } + const items = prepared.map((p, i) => ({ ...p, embedding: embeddings[i] })); + + const vetoFn = cfg.entityVetoEnabled + ? 
(a, b) => shouldVeto(a.title, b.title) + : null; + const clusterResult = completeLinkCluster(items, { + cosineThreshold: cfg.cosineThreshold, + vetoFn, + }); + + const embedClusters = clusterResult.clusters; + const embedOutput = embedClusters.map((cluster) => + materializeCluster(cluster.map((i) => items[i].story)), + ); + + log( + `[digest] dedup mode=embed stories=${items.length} clusters=${embedClusters.length} ` + + `veto_fires=${clusterResult.vetoFires} ms=${nowImpl() - started} ` + + `threshold=${cfg.cosineThreshold} fallback=false`, + ); + return embedOutput; + } catch (err) { + const reason = + err instanceof Error && typeof err.name === 'string' && err.name !== 'Error' + ? err.name + : 'other'; + const msg = err instanceof Error ? err.message : String(err); + warn( + `[digest] dedup embed path failed, falling back to Jaccard reason=${reason} msg=${msg}`, + ); + return jaccard(stories); + } +} diff --git a/scripts/lib/brief-embedding.mjs b/scripts/lib/brief-embedding.mjs new file mode 100644 index 000000000..09d89a897 --- /dev/null +++ b/scripts/lib/brief-embedding.mjs @@ -0,0 +1,300 @@ +/** + * Embedding client for brief-dedup. + * + * Exports: + * - normalizeForEmbedding(title): the SINGLE function that produces + * both the embedded string and the cache-key input. No aliasing + * possible (plan's "normalization contract"). + * - embedBatch(normalizedTitles, deps): batched, cached, all-or- + * nothing. Throws EmbeddingTimeoutError on wall-clock overrun and + * EmbeddingProviderError on any upstream failure. Never returns a + * partial result. + * + * Contract details: + * - Cache: brief:emb:v1:text-3-small-512:, + * 14-day TTL, JSON array of 512 numbers. + * - Deterministic: same input → same output vectors (cache hits) + * or same OpenRouter call (cache misses). + * - `deps` is for tests — prod callers pass nothing and get the + * real fetch / Upstash / AbortSignal wired in. 
+ */ + +import { createHash } from 'node:crypto'; + +import { + CACHE_KEY_PREFIX, + CACHE_TTL_SECONDS, + EMBED_DIMS, + EMBED_MODEL, + OPENROUTER_EMBEDDINGS_URL, +} from './brief-dedup-consts.mjs'; +import { stripSourceSuffix } from './brief-dedup-jaccard.mjs'; +import { defaultRedisPipeline } from './_upstash-pipeline.mjs'; + +export class EmbeddingProviderError extends Error { + constructor(message, { status, cause } = {}) { + super(message); + this.name = 'EmbeddingProviderError'; + if (status !== undefined) this.status = status; + if (cause !== undefined) this.cause = cause; + } +} + +export class EmbeddingTimeoutError extends Error { + constructor(message = 'Embedding wall-clock budget exceeded') { + super(message); + this.name = 'EmbeddingTimeoutError'; + } +} + +/** + * The ONE normalisation function. Cache-key input = embed-request + * input. Any caller that embeds outside this function will drift. + * + * 1. Strip wire-service suffixes (" - Reuters", " | AP News", etc.) + * via the shared stripSourceSuffix so the outlet allow-list is + * single-sourced with the Jaccard fallback. Adding a new outlet + * updates both paths at once. + * 2. Trim. + * 3. Collapse internal whitespace. + * 4. Lowercase. + */ +export function normalizeForEmbedding(title) { + if (typeof title !== 'string') return ''; + return stripSourceSuffix(title).trim().replace(/\s+/g, ' ').toLowerCase(); +} + +export function cacheKeyFor(normalizedTitle) { + const hash = createHash('sha256').update(normalizedTitle).digest('hex'); + return `${CACHE_KEY_PREFIX}:${hash}`; +} + +// Default (production) deps wiring lives in ./_upstash-pipeline.mjs so +// the orchestrator and the embedding client share one implementation. + +/** + * Look up a set of cache keys via the redis pipeline and return a + * Map of key → vector for the hits. Misses, corrupt cells, pipeline + * failures are all treated as "not in cache" — the caller falls + * through to the API. 
+ * + * Kept as a helper so embedBatch's cognitive complexity stays + * reviewable; there's no other caller. + */ +async function cacheGetBatched(uniqueKeys, pipelineImpl) { + const hits = new Map(); + if (uniqueKeys.length === 0) return hits; + const getResults = await pipelineImpl(uniqueKeys.map((k) => ['GET', k])); + if (!Array.isArray(getResults)) return hits; + for (let i = 0; i < uniqueKeys.length; i++) { + const cell = getResults[i]; + const raw = cell && typeof cell === 'object' && 'result' in cell ? cell.result : null; + if (typeof raw !== 'string') continue; + try { + const parsed = JSON.parse(raw); + if (Array.isArray(parsed) && parsed.length === EMBED_DIMS) { + hits.set(uniqueKeys[i], parsed); + } + } catch { + // Corrupt cache cell: treat as miss. Don't error — next + // successful API call will overwrite. + } + } + return hits; +} + +/** + * Single batched OpenRouter /embeddings call for `missingTitles`. + * Returns a number[N] where N = missingTitles.length. Throws + * EmbeddingTimeoutError on abort/timeout, EmbeddingProviderError on + * any other upstream failure. NEVER returns a partial result. + */ +async function callEmbeddingsApi({ fetchImpl, apiKey, missingTitles, timeoutMs }) { + // Negative / zero remaining-budget means the deadline is already past. + // Bail to the orchestrator's all-or-nothing fallback rather than open a + // doomed HTTP connection that blows the wall-clock cap by the floor. 
+ if (!Number.isFinite(timeoutMs) || timeoutMs <= 0) { + throw new EmbeddingTimeoutError(); + } + let resp; + try { + resp = await fetchImpl(OPENROUTER_EMBEDDINGS_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + 'HTTP-Referer': 'https://worldmonitor.app', + 'X-Title': 'World Monitor', + 'User-Agent': 'worldmonitor-digest/1.0', + }, + body: JSON.stringify({ + model: EMBED_MODEL, + input: missingTitles, + dimensions: EMBED_DIMS, + }), + signal: AbortSignal.timeout(timeoutMs), + }); + } catch (err) { + if (err && (err.name === 'TimeoutError' || err.name === 'AbortError')) { + throw new EmbeddingTimeoutError(); + } + throw new EmbeddingProviderError( + `embedBatch: fetch failed — ${err instanceof Error ? err.message : String(err)}`, + { cause: err }, + ); + } + if (!resp.ok) { + throw new EmbeddingProviderError( + `embedBatch: OpenRouter returned HTTP ${resp.status}`, + { status: resp.status }, + ); + } + let body; + try { + body = await resp.json(); + } catch (err) { + throw new EmbeddingProviderError( + `embedBatch: response JSON parse failed — ${err instanceof Error ? err.message : String(err)}`, + { cause: err }, + ); + } + const data = Array.isArray(body?.data) ? body.data : null; + if (!data || data.length !== missingTitles.length) { + throw new EmbeddingProviderError( + `embedBatch: expected ${missingTitles.length} embeddings, got ${data?.length ?? 'none'}`, + ); + } + // Honour entry.index if the provider re-orders; fall back to i. + const out = new Array(missingTitles.length); + for (let i = 0; i < data.length; i++) { + const entry = data[i]; + const idx = typeof entry?.index === 'number' ? entry.index : i; + const vector = entry?.embedding; + if (!Array.isArray(vector) || vector.length !== EMBED_DIMS) { + throw new EmbeddingProviderError( + `embedBatch: embedding[${idx}] has unexpected length ${vector?.length ?? 
'n/a'}`, + ); + } + out[idx] = vector; + } + return out; +} + +/** + * Embed a batch of already-normalised titles with cache look-through. + * + * @param {string[]} normalizedTitles output of normalizeForEmbedding for each title + * @param {object} [deps] + * @param {typeof fetch} [deps.fetch] + * @param {(commands: Array) => Promise | null>} [deps.redisPipeline] + * @param {() => number} [deps.now] + * @param {number} [deps.wallClockMs] + * @param {string} [deps._apiKey] OPENROUTER_API_KEY override (tests only; + * prefixed to discourage accidental spread from user-controlled objects) + * @returns {Promise} one 512-dim vector per input, in order + * + * Throws EmbeddingTimeoutError on wall-clock overrun. + * Throws EmbeddingProviderError on any upstream / parse failure. + * NEVER returns a partial batch — the orchestrator relies on this to + * collapse the entire run to Jaccard on any failure. + */ +export async function embedBatch(normalizedTitles, deps = {}) { + if (!Array.isArray(normalizedTitles)) { + throw new EmbeddingProviderError('embedBatch: normalizedTitles must be an array'); + } + if (normalizedTitles.length === 0) return []; + + // Wrap rather than assign: bare `fetch` captures the current global + // binding at lookup time, so later monkey-patches (instrumentation, + // Edge-runtime shims) don't see the wrapper. See AGENTS.md's + // "fetch.bind(globalThis) is BANNED" rule — same class of bug. + const fetchImpl = deps.fetch ?? ((...args) => globalThis.fetch(...args)); + const pipelineImpl = deps.redisPipeline ?? defaultRedisPipeline; + const nowImpl = deps.now ?? (() => Date.now()); + const wallClockMs = deps.wallClockMs ?? 45_000; + const apiKey = deps._apiKey ?? process.env.OPENROUTER_API_KEY ?? ''; + + if (!apiKey) { + // Provider failure so the orchestrator falls back to Jaccard rather + // than silently embedding with no auth. 
+ throw new EmbeddingProviderError('OPENROUTER_API_KEY not configured'); + } + + const deadline = nowImpl() + wallClockMs; + + // Deduped cache-key table. Same normalised title → same cache cell. + const keyByIndex = normalizedTitles.map((t) => cacheKeyFor(t)); + const uniqueKeys = [...new Set(keyByIndex)]; + + const vectorByKey = await cacheGetBatched(uniqueKeys, pipelineImpl); + if (nowImpl() > deadline) throw new EmbeddingTimeoutError(); + + // Build the miss list, preserving the first normalised title we + // saw for each unique key. + const missingKeys = uniqueKeys.filter((k) => !vectorByKey.has(k)); + if (missingKeys.length > 0) { + const missingTitleByKey = new Map(); + for (let i = 0; i < normalizedTitles.length; i++) { + if (!vectorByKey.has(keyByIndex[i]) && !missingTitleByKey.has(keyByIndex[i])) { + missingTitleByKey.set(keyByIndex[i], normalizedTitles[i]); + } + } + const missingTitles = missingKeys.map((k) => missingTitleByKey.get(k) ?? ''); + const freshVectors = await callEmbeddingsApi({ + fetchImpl, + apiKey, + missingTitles, + timeoutMs: deadline - nowImpl(), + }); + const cacheWrites = []; + for (let i = 0; i < freshVectors.length; i++) { + const key = missingKeys[i]; + vectorByKey.set(key, freshVectors[i]); + cacheWrites.push(['SET', key, JSON.stringify(freshVectors[i]), 'EX', String(CACHE_TTL_SECONDS)]); + } + // Cache writes are best-effort — a failure costs us a re-embed + // on the next run, never a correctness bug. + try { + await pipelineImpl(cacheWrites); + } catch { + // swallow + } + } + + // Map back to input order; duplicated titles share a vector. + const out = new Array(normalizedTitles.length); + for (let i = 0; i < normalizedTitles.length; i++) { + const v = vectorByKey.get(keyByIndex[i]); + if (!v) { + throw new EmbeddingProviderError( + `embedBatch: missing vector for index ${i} after API call`, + ); + } + out[i] = v; + } + return out; +} + +/** + * Cosine similarity for two equal-length vectors. 
/**
 * Cosine similarity between two equal-length numeric vectors.
 *
 * Yields a value in [-1, 1]; 1 means identical direction. Degenerate
 * inputs — non-arrays, length mismatch, empty vectors, or a
 * zero-magnitude vector — all map to 0 rather than throwing, so
 * callers can read the result as "no similarity signal" without
 * special-casing.
 *
 * Exported so the clusterer and tests share one implementation.
 */
export function cosineSimilarity(a, b) {
  const comparable =
    Array.isArray(a) && Array.isArray(b) && a.length > 0 && a.length === b.length;
  if (!comparable) return 0;

  let inner = 0;
  let sumSqA = 0;
  let sumSqB = 0;
  for (let k = 0; k < a.length; k++) {
    inner += a[k] * b[k];
    sumSqA += a[k] * a[k];
    sumSqB += b[k] * b[k];
  }
  if (sumSqA === 0 || sumSqB === 0) return 0;
  return inner / (Math.sqrt(sumSqA) * Math.sqrt(sumSqB));
}
const COUNTRIES = [
  // ISO-3166 short names (lowercase). Not exhaustive — top ~80 by wire
  // frequency; add on demand. Full ISO list is cheap to drop in later
  // if we see gaps in the calibration data.
  'afghanistan', 'albania', 'algeria', 'argentina', 'armenia', 'australia',
  'austria', 'azerbaijan', 'bahrain', 'bangladesh', 'belarus', 'belgium',
  'bolivia', 'bosnia', 'brazil', 'bulgaria', 'cambodia', 'cameroon', 'canada',
  'chile', 'china', 'colombia', 'congo', 'croatia', 'cuba', 'cyprus', 'czechia',
  'denmark', 'ecuador', 'egypt', 'eritrea', 'estonia', 'ethiopia', 'finland',
  'france', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'haiti',
  'honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq',
  'ireland', 'israel', 'italy', 'japan', 'jordan', 'kazakhstan', 'kenya',
  'kuwait', 'kyrgyzstan', 'laos', 'latvia', 'lebanon', 'libya', 'lithuania',
  'luxembourg', 'malaysia', 'maldives', 'mali', 'malta', 'mexico', 'moldova',
  'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal',
  'netherlands', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan',
  'panama', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar',
  'romania', 'russia', 'rwanda', 'serbia', 'singapore', 'slovakia', 'slovenia',
  'somalia', 'spain', 'sudan', 'sweden', 'switzerland', 'syria', 'taiwan',
  'tajikistan', 'tanzania', 'thailand', 'tunisia', 'turkey', 'turkmenistan',
  'uganda', 'ukraine', 'uruguay', 'uzbekistan', 'venezuela', 'vietnam',
  'yemen', 'zambia', 'zimbabwe',
  // Common short forms / alternate spellings used in wire headlines.
  'us', 'usa', 'uk', 'uae', 'drc', 'prc', 'rok', 'dprk',
  // Gap-fill additions: countries whose capitals / major cities were
  // already in CITIES below but whose own names were missing (e.g.
  // Riyadh without Saudi Arabia, Seoul without South Korea). Appended
  // rather than interleaved so the existing Set insertion order is
  // untouched.
  'angola', 'burkina faso', 'madagascar', 'new zealand', 'north korea',
  'saudi arabia', 'senegal', 'south africa', 'south korea', 'south sudan',
  'sri lanka',
];

const CITIES = [
  // Top ~120 cities by wire-headline frequency. Ordered roughly by
  // region for diff-readability, not alphabetical.
  // North America
  'washington', 'new york', 'los angeles', 'chicago', 'houston', 'miami',
  'atlanta', 'boston', 'seattle', 'philadelphia', 'detroit', 'dallas',
  'ottawa', 'toronto', 'montreal', 'vancouver', 'mexico city', 'havana',
  // Europe
  'london', 'paris', 'berlin', 'brussels', 'amsterdam', 'rome', 'madrid',
  'lisbon', 'dublin', 'vienna', 'prague', 'warsaw', 'budapest', 'athens',
  'stockholm', 'copenhagen', 'oslo', 'helsinki', 'zurich', 'geneva', 'bern',
  'milan', 'naples', 'barcelona', 'munich', 'frankfurt', 'hamburg',
  'edinburgh', 'glasgow', 'manchester', 'belfast',
  // Eastern Europe / Russia
  'moscow', 'st petersburg', 'kyiv', 'kiev', 'odesa', 'odessa', 'kharkiv',
  'lviv', 'mariupol', 'donetsk', 'luhansk', 'minsk', 'chisinau', 'tbilisi',
  'yerevan', 'baku', 'bucharest', 'sofia', 'belgrade', 'zagreb', 'sarajevo',
  'pristina', 'skopje', 'tirana', 'vilnius', 'riga', 'tallinn',
  // Middle East / North Africa
  'tehran', 'baghdad', 'damascus', 'beirut', 'amman', 'riyadh', 'doha',
  'dubai', 'abu dhabi', 'kuwait city', 'muscat', 'manama', 'sanaa', 'aden',
  'jerusalem', 'tel aviv', 'gaza', 'ramallah', 'cairo', 'alexandria',
  'tripoli', 'benghazi', 'tunis', 'algiers', 'casablanca', 'rabat',
  'ankara', 'istanbul', 'izmir',
  // Africa (sub-Saharan)
  'lagos', 'abuja', 'accra', 'nairobi', 'addis ababa', 'khartoum', 'juba',
  'kampala', 'kigali', 'dakar', 'bamako', 'ouagadougou', 'niamey',
  'johannesburg', 'cape town', 'pretoria', 'harare', 'lusaka', 'maputo',
  'kinshasa', 'brazzaville', 'luanda', 'antananarivo', 'mogadishu',
  // Asia
  'beijing', 'shanghai', 'hong kong', 'taipei', 'tokyo', 'osaka', 'kyoto',
  'seoul', 'pyongyang', 'delhi', 'new delhi', 'mumbai', 'kolkata', 'chennai',
  'bengaluru', 'hyderabad', 'islamabad', 'karachi', 'lahore', 'kabul',
  'dhaka', 'colombo', 'kathmandu', 'rangoon', 'yangon', 'naypyidaw',
  'bangkok', 'phnom penh', 'hanoi', 'ho chi minh city', 'saigon',
  // NOTE(review): 'singapore' appears both here AND in COUNTRIES. As a
  // city-state that may be intentional, but it means the token lands in
  // LOCATION_GAZETTEER and therefore classifies as a Location, while
  // the stated rule is "countries are actors". Confirm which side the
  // veto should see it on.
  'vientiane', 'kuala lumpur', 'jakarta', 'manila', 'singapore',
  // Oceania
  'canberra', 'sydney', 'melbourne', 'auckland', 'wellington',
  // Latin America (south)
  'brasilia', 'rio de janeiro', 'sao paulo', 'buenos aires', 'santiago',
  'lima', 'bogota', 'caracas', 'quito', 'la paz', 'asuncion', 'montevideo',
];

const REGIONS = [
  // Geopolitical / geographic regions that travel as wire headline entities.
  'middle east', 'north africa', 'sub-saharan africa', 'horn of africa',
  'west bank', 'gaza strip', 'sinai',
  'balkans', 'caucasus', 'central asia', 'south asia', 'southeast asia',
  'east asia', 'latin america', 'caribbean', 'scandinavia', 'baltics',
  'eu', 'nato', 'asean', 'gulf',
  // Bodies of water / straits that name-drive news events.
  'hormuz', 'strait of hormuz', 'bab el-mandeb', 'red sea', 'black sea',
  'south china sea', 'east china sea', 'baltic sea', 'mediterranean',
  'persian gulf', 'arabian gulf', 'gulf of aden', 'gulf of oman',
  'taiwan strait', 'english channel',
  // Commonly-named disputed / conflict zones.
  'donbas', 'donbass', 'crimea', 'kashmir', 'tibet', 'xinjiang',
  'nagorno-karabakh', 'transnistria',
];

// Country names are INTENTIONALLY NOT in LOCATION_GAZETTEER — in news
// headlines the country is usually the political actor ("Iran closes
// Hormuz") not the venue, so the veto classifies country tokens as
// actors. The COUNTRIES array is still exported below (as COUNTRY_NAMES)
// so a caller that needs the list (e.g. a future NER pass) can consume it.
export const LOCATION_GAZETTEER = new Set([
  ...CITIES,
  ...REGIONS,
]);

export const COUNTRY_NAMES = new Set(COUNTRIES);

// ── Common capitalized English words that are NOT entities ───────────
// Sentence-initial capitalisation and a few idiom-openers that the
// veto would otherwise pick up as proper nouns.

export const COMMON_CAPITALIZED = new Set([
  // Articles / determiners / pronouns
  'the', 'a', 'an', 'this', 'that', 'these', 'those', 'some', 'any',
  'all', 'every', 'each', 'no', 'none', 'other', 'another',
  'i', 'you', 'he', 'she', 'it', 'we', 'they', 'my', 'your', 'his',
  'her', 'its', 'our', 'their',
  // Prepositions / conjunctions / common sentence starters
  'in', 'on', 'at', 'by', 'to', 'for', 'of', 'with', 'from', 'up',
  'down', 'out', 'into', 'onto', 'over', 'under', 'after', 'before',
  'during', 'since', 'until', 'as', 'than', 'then', 'so', 'but', 'and',
  'or', 'nor', 'yet', 'if', 'because', 'while', 'when', 'where',
  'why', 'how', 'who', 'what', 'which', 'whose',
  // Common news-headline sentence starters
  'breaking', 'report', 'reports', 'new', 'latest', 'today', 'yesterday',
  'now', 'live', 'updates', 'update', 'analysis', 'opinion', 'exclusive',
  'video', 'watch', 'listen', 'read',
  // Auxiliary / modal verbs that may lead a headline
  'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
  'had', 'do', 'does', 'did', 'can', 'could', 'will', 'would', 'shall',
  'should', 'may', 'might', 'must',
]);
────────────────────────────────────────────────────── - -const STOP_WORDS = new Set([ - 'the','a','an','in','on','at','to','for','of','is','are','was','were', - 'has','have','had','be','been','by','from','with','as','it','its', - 'says','say','said','according','reports','report','officials','official', - 'us','new','will','can','could','would','may','also','who','that','this', - 'after','about','over','more','up','out','into','than','some','other', -]); - -function stripSourceSuffix(title) { - return title - .replace(/\s*[-–—]\s*[\w\s.]+\.(?:com|org|net|co\.uk)\s*$/i, '') - .replace(/\s*[-–—]\s*(?:Reuters|AP News|BBC|CNN|Al Jazeera|France 24|DW News|PBS NewsHour|CBS News|NBC|ABC|Associated Press|The Guardian|NOS Nieuws|Tagesschau|CNBC|The National)\s*$/i, ''); -} - -function extractTitleWords(title) { - return new Set( - stripSourceSuffix(title) - .toLowerCase() - .replace(/[^\p{L}\p{N}\s]/gu, '') - .split(/\s+/) - .filter(w => w.length > 2 && !STOP_WORDS.has(w)), - ); -} - -function jaccardSimilarity(setA, setB) { - if (setA.size === 0 || setB.size === 0) return 0; - let intersection = 0; - for (const w of setA) if (setB.has(w)) intersection++; - return intersection / (setA.size + setB.size - intersection); -} - -function deduplicateStories(stories) { - const clusters = []; - for (const story of stories) { - const words = extractTitleWords(story.title); - let merged = false; - for (const cluster of clusters) { - if (jaccardSimilarity(words, cluster.words) > 0.55) { - cluster.items.push(story); - merged = true; - break; - } - } - if (!merged) clusters.push({ words, items: [story] }); - } - return clusters.map(({ items }) => { - items.sort((a, b) => b.currentScore - a.currentScore || b.mentionCount - a.mentionCount); - const best = { ...items[0] }; - if (items.length > 1) { - best.mentionCount = items.reduce((sum, s) => sum + s.mentionCount, 0); - } - best.mergedHashes = items.map(s => s.hash); - return best; - }); -} - // ── Digest content 
──────────────────────────────────────────────────────────── +// Dedup lives in scripts/lib/brief-dedup.mjs (orchestrator) with the +// legacy Jaccard in scripts/lib/brief-dedup-jaccard.mjs. The orchestrator +// reads DIGEST_DEDUP_MODE at call time — default 'jaccard' keeps +// behaviour identical to pre-embedding production. stripSourceSuffix +// is imported from the Jaccard module so the text/HTML formatters +// below keep their current per-story title cleanup. + async function buildDigest(rule, windowStartMs) { const variant = rule.variant ?? 'full'; const lang = rule.lang ?? 'en'; @@ -324,7 +275,7 @@ async function buildDigest(rule, windowStartMs) { if (stories.length === 0) return null; stories.sort((a, b) => b.currentScore - a.currentScore); - const deduped = deduplicateStories(stories); + const deduped = await deduplicateStories(stories); const top = deduped.slice(0, DIGEST_MAX_ITEMS); const allSourceCmds = []; diff --git a/tests/brief-dedup-embedding.test.mjs b/tests/brief-dedup-embedding.test.mjs new file mode 100644 index 000000000..ad0a24da6 --- /dev/null +++ b/tests/brief-dedup-embedding.test.mjs @@ -0,0 +1,556 @@ +/** + * Embedding-dedup integration tests against a deterministic stub + * embedder — no network. Covers the 9 scenarios enumerated in + * docs/plans/2026-04-19-001-feat-embedding-based-story-dedup-plan.md: + * + * 1. Happy path + * 2. Cold-cache timeout → Jaccard fallback + * 3. Provider outage → Jaccard fallback + * 4. Shadow mode + * 5. Entity veto fires + * 6. Complete-link non-chaining + * 7. Cluster-level fixture + * 8. Remote-embed-disabled bypass + * 9. Permutation-invariance property test + * + * The live-embedder golden-pair validator lives in a separate nightly + * CI job (.github/workflows/dedup-golden-pairs.yml) — it's NOT run + * from the brief cron and NOT in this file. 
+ * + * Run: node --test tests/brief-dedup-embedding.test.mjs + */ + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; + +import { deduplicateStories } from '../scripts/lib/brief-dedup.mjs'; +import { deduplicateStoriesJaccard } from '../scripts/lib/brief-dedup-jaccard.mjs'; +import { + EmbeddingProviderError, + EmbeddingTimeoutError, + cosineSimilarity, + normalizeForEmbedding, +} from '../scripts/lib/brief-embedding.mjs'; +import { + completeLinkCluster, + extractEntities, + shouldVeto, +} from '../scripts/lib/brief-dedup-embed.mjs'; + +// ── Fixture helpers ─────────────────────────────────────────────────────────── + +function story(title, score = 10, mentions = 1, hash = undefined) { + return { + title, + currentScore: score, + mentionCount: mentions, + sources: [], + severity: 'critical', + hash: hash ?? `h-${title.slice(0, 16).replace(/\W+/g, '-')}`, + }; +} + +// Orchestrator env that turns on the embed path without shadow-archive +// dependencies. +const EMBED_MODE = { DIGEST_DEDUP_MODE: 'embed', DIGEST_DEDUP_COSINE_THRESHOLD: '0.5' }; + +/** + * Build a stub embedBatch that looks up each normalised title in a + * provided map. Captures call count for assertion-based tests. Any + * title missing from the map is embedded as the zero vector — which + * will fail cosine similarity > 0, so the test will notice. + */ +function stubEmbedder(vectorByNormalizedTitle) { + const calls = []; + async function embedBatch(normalizedTitles) { + calls.push(normalizedTitles.slice()); + return normalizedTitles.map((t) => { + const v = vectorByNormalizedTitle.get(t); + if (!v) throw new Error(`stubEmbedder: no vector for "${t}"`); + return v; + }); + } + return { embedBatch, calls }; +} + +function noopPipeline() { + return null; +} + +/** + * Captures log lines emitted by the orchestrator so tests can assert + * on observability output without swallowing real console output. 
+ */ +function lineCollector() { + const lines = []; + return { + lines, + log: (line) => lines.push({ level: 'log', line }), + warn: (line) => lines.push({ level: 'warn', line }), + }; +} + +// ── Scenario 1 — Happy path ─────────────────────────────────────────────────── + +describe('Scenario 1 — happy path: embed clusters near-duplicates', () => { + it('merges two near-duplicate stories into one cluster when embed mode is on', async () => { + const titles = [ + 'iran closes strait of hormuz', + 'iran shuts strait of hormuz', + 'myanmar coup leader elected president', + ]; + // Near-parallel vectors for 0/1 (cos ≈ 0.95), orthogonal for 2. + const vecByTitle = new Map([ + [titles[0], [1, 0, 0]], + [titles[1], [0.95, Math.sqrt(1 - 0.95 * 0.95), 0]], + [titles[2], [0, 0, 1]], + ]); + const embedder = stubEmbedder(vecByTitle); + const collector = lineCollector(); + + const stories = [ + story('Iran closes Strait of Hormuz', 90, 1, 'h0'), + story('Iran shuts Strait of Hormuz', 85, 1, 'h1'), + story('Myanmar coup leader elected president', 80, 1, 'h2'), + ]; + const out = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: embedder.embedBatch, + redisPipeline: noopPipeline, + ...collector, + }); + + assert.equal(embedder.calls.length, 1, 'exactly one batched embedBatch call'); + assert.equal(out.length, 2, 'two clusters (merged pair + singleton)'); + + const merged = out.find((c) => c.mergedHashes.length === 2); + assert.ok(merged, 'one cluster contains the two Hormuz variants'); + assert.deepEqual(new Set(merged.mergedHashes), new Set(['h0', 'h1'])); + assert.equal(merged.mentionCount, 2); + + const singleton = out.find((c) => c.mergedHashes.length === 1); + assert.ok(singleton); + assert.equal(singleton.mergedHashes[0], 'h2'); + + // Structured log line emitted. 
+ assert.ok(collector.lines.some((l) => l.line.includes('mode=embed'))); + assert.ok(collector.lines.some((l) => l.line.includes('fallback=false'))); + }); +}); + +// ── Scenario 2 — timeout ────────────────────────────────────────────────────── + +describe('Scenario 2 — cold-cache timeout collapses to Jaccard', () => { + it('EmbeddingTimeoutError falls back to Jaccard for the whole batch', async () => { + const throwingEmbedder = async () => { + throw new EmbeddingTimeoutError(); + }; + const stories = [ + story('Iran closes Strait of Hormuz', 90, 1, 'h0'), + story('Iran shuts Strait of Hormuz', 85, 1, 'h1'), + ]; + const collector = lineCollector(); + + const out = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: throwingEmbedder, + redisPipeline: noopPipeline, + ...collector, + }); + + // Jaccard output is the ground truth under fallback — deep-equal + // cluster shape, not just length, so a regression that preserves + // count but changes membership or representative can't slip. + const expected = deduplicateStoriesJaccard(stories); + assert.equal(out.length, expected.length); + for (let i = 0; i < out.length; i++) { + assert.equal(out[i].hash, expected[i].hash); + assert.deepEqual(out[i].mergedHashes, expected[i].mergedHashes); + assert.equal(out[i].mentionCount, expected[i].mentionCount); + } + // Fallback warn line must carry a filterable reason= field. 
+ const fallbackWarn = collector.lines.find( + (l) => l.level === 'warn' && l.line.includes('falling back to Jaccard'), + ); + assert.ok(fallbackWarn, 'warn line on fallback'); + assert.match(fallbackWarn.line, /reason=EmbeddingTimeoutError\b/); + }); +}); + +// ── Scenario 3 — provider outage ────────────────────────────────────────────── + +describe('Scenario 3 — provider outage collapses to Jaccard', () => { + it('EmbeddingProviderError (HTTP 503) falls back', async () => { + const throwingEmbedder = async () => { + throw new EmbeddingProviderError('OpenRouter returned HTTP 503', { status: 503 }); + }; + const stories = [story('a', 10, 1, 'a1'), story('b', 10, 1, 'b1')]; + const collector = lineCollector(); + + const out = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: throwingEmbedder, + redisPipeline: noopPipeline, + ...collector, + }); + + const expected = deduplicateStoriesJaccard(stories); + assert.equal(out.length, expected.length); + for (let i = 0; i < out.length; i++) { + assert.equal(out[i].hash, expected[i].hash); + assert.deepEqual(out[i].mergedHashes, expected[i].mergedHashes); + assert.equal(out[i].mentionCount, expected[i].mentionCount); + } + const fallbackWarn = collector.lines.find((l) => l.level === 'warn'); + assert.ok(fallbackWarn, 'warn line on fallback'); + assert.match(fallbackWarn.line, /reason=EmbeddingProviderError\b/); + }); +}); + +// ── Scenario 4 / 8 — shadow mode and remote-embed kill switch were +// removed when the rollout was simplified to "ship embed directly". +// MODE=jaccard is the only rollback path; covered in +// tests/brief-dedup-jaccard.test.mjs. 
+ +// ── Scenario 5 — entity veto ────────────────────────────────────────────────── + +describe('Scenario 5 — entity veto blocks same-location, different-actor merges', () => { + it('shouldVeto fires on canonical Biden/Xi vs Biden/Putin case', () => { + assert.equal( + shouldVeto('Biden meets Xi in Tokyo', 'Biden meets Putin in Tokyo'), + true, + ); + }); + + it('defers to cosine on Iran/Tehran + Hormuz (documented heuristic limitation)', () => { + // Capital-country coreference is not resolved in v1. The plan's + // original spec claimed the veto would fire here via "unique + // actors {Iran} vs {Tehran}", but the classification rule is: + // - Iran → actor (country, not in gazetteer) + // - Tehran → location (capital city IS in the gazetteer) + // - Hormuz → location + // With the two anchors on different sides of the actor/location + // boundary, there's no symmetric "unique actor on each side" + // signal and the veto can't conclude. Behaviour falls through + // to cosine — which on real text may merge (false positive) + // or split (false negative) depending on wording. Accepted for + // v1 as the documented limitation; a name-normaliser is the + // future fix. + assert.equal( + shouldVeto('Iran closes Hormuz', 'Tehran shuts Hormuz'), + false, + ); + }); + + it('shouldVeto does NOT fire when actors fully match', () => { + assert.equal(shouldVeto('Trump meets Xi', 'Trump Xi summit'), false); + }); + + it('shouldVeto defers to cosine when proper-noun sets are empty on both sides', () => { + assert.equal(shouldVeto('the meeting concludes', 'the meeting ends'), false); + }); + + it('veto blocks cluster admission end-to-end', async () => { + // High cosine (0.99) but disagreeing actors → veto fires and + // the stories stay in separate clusters. 
+ const stories = [ + story('Biden meets Xi in Tokyo', 90, 1, 'xi'), + story('Biden meets Putin in Tokyo', 85, 1, 'putin'), + ]; + const vecByTitle = new Map([ + [normalizeForEmbedding(stories[0].title), [1, 0, 0]], + [normalizeForEmbedding(stories[1].title), [0.99, Math.sqrt(1 - 0.99 * 0.99), 0]], + ]); + const embedder = stubEmbedder(vecByTitle); + + const out = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: embedder.embedBatch, + redisPipeline: noopPipeline, + }); + + assert.equal(out.length, 2, 'veto keeps the two titles in separate clusters'); + }); + + it('DIGEST_DEDUP_ENTITY_VETO_ENABLED=0 disables the veto at runtime', async () => { + const stories = [ + story('Biden meets Xi in Tokyo', 90, 1, 'xi'), + story('Biden meets Putin in Tokyo', 85, 1, 'putin'), + ]; + const vecByTitle = new Map([ + [normalizeForEmbedding(stories[0].title), [1, 0, 0]], + [normalizeForEmbedding(stories[1].title), [0.99, Math.sqrt(1 - 0.99 * 0.99), 0]], + ]); + const embedder = stubEmbedder(vecByTitle); + + const out = await deduplicateStories(stories, { + env: { ...EMBED_MODE, DIGEST_DEDUP_ENTITY_VETO_ENABLED: '0' }, + embedBatch: embedder.embedBatch, + redisPipeline: noopPipeline, + }); + + assert.equal(out.length, 1, 'without the veto, cosine alone merges the two titles'); + }); +}); + +// ── Scenario 6 — complete-link non-chaining ─────────────────────────────────── + +describe('Scenario 6 — complete-link blocks transitive chaining', () => { + it('A~B=0.65, B~C=0.65, A~C=0.30 → {A,B} and {C}, NOT {A,B,C}', () => { + // Constructed so pairwise cosines are exact (see plan for derivation). + const a = [1, 0, 0, 0]; + const b = [0.65, Math.sqrt(1 - 0.65 * 0.65), 0, 0]; + // c must satisfy: a·c = 0.30, b·c = 0.65, |c| = 1. 
+ // Solving: cx=0.30; cy=(0.65 - 0.65*0.30)/sqrt(1-0.4225) = 0.4550/0.7599 = 0.599; + // cz = sqrt(1 - 0.09 - 0.359) = sqrt(0.551) = 0.7423 + const cx = 0.3; + const cy = (0.65 - 0.65 * 0.3) / Math.sqrt(1 - 0.65 * 0.65); + const cz = Math.sqrt(1 - cx * cx - cy * cy); + const c = [cx, cy, cz, 0]; + + // Sanity-check the construction so a regression in the derivation + // can't mask a real bug. + assert.ok(Math.abs(cosineSimilarity(a, b) - 0.65) < 1e-6); + assert.ok(Math.abs(cosineSimilarity(b, c) - 0.65) < 1e-6); + assert.ok(Math.abs(cosineSimilarity(a, c) - 0.3) < 1e-6); + + const items = [ + { title: 'A', embedding: a }, + { title: 'B', embedding: b }, + { title: 'C', embedding: c }, + ]; + const { clusters } = completeLinkCluster(items, { cosineThreshold: 0.5 }); + + // {A,B} should be one cluster, {C} separate — not {A,B,C}. + assert.equal(clusters.length, 2); + const abCluster = clusters.find((cl) => cl.length === 2); + const cCluster = clusters.find((cl) => cl.length === 1); + assert.ok(abCluster && cCluster, 'two clusters: the A+B pair and the C singleton'); + assert.ok(abCluster.includes(0) && abCluster.includes(1)); + assert.ok(cCluster.includes(2)); + }); +}); + +// ── Scenario 7 — cluster-level fixture ──────────────────────────────────────── + +describe('Scenario 7 — cluster-level fixture', () => { + it('10-story fixture clusters into the expected shape', async () => { + // Four real wire-headline clusters plus two singletons = 6 clusters. + // Vectors are hand-crafted so only intended-cluster pairs clear 0.5. + const e1 = [1, 0, 0, 0, 0, 0]; + const e2 = [0, 1, 0, 0, 0, 0]; + const e3 = [0, 0, 1, 0, 0, 0]; + const e4 = [0, 0, 0, 1, 0, 0]; + const e5 = [0, 0, 0, 0, 1, 0]; + const e6 = [0, 0, 0, 0, 0, 1]; + + function near(axis, epsilon = 0.03) { + // Same-direction vector at cosine > 0.99 to `axis` basis. 
+ const out = axis.slice(); + return out.map((v) => v * (1 - epsilon)); + } + + const fixtures = [ + { title: 'Iran closes Strait of Hormuz', hash: 'a1', v: e1, expectCluster: 'A' }, + { title: 'Iran shuts Strait of Hormuz', hash: 'a2', v: near(e1), expectCluster: 'A' }, + { title: 'US fighter jet downed over Iran', hash: 'b1', v: e2, expectCluster: 'B' }, + { title: 'American aircraft shot down in Iran', hash: 'b2', v: near(e2), expectCluster: 'B' }, + { title: 'Myanmar coup leader sworn in', hash: 'c1', v: e3, expectCluster: 'C' }, + { title: 'Myanmar junta chief takes office', hash: 'c2', v: near(e3), expectCluster: 'C' }, + { title: 'Brent crude tops $140', hash: 'd1', v: e4, expectCluster: 'D' }, + { title: 'Oil price surges past $140', hash: 'd2', v: near(e4), expectCluster: 'D' }, + { title: 'Singleton 1', hash: 's1', v: e5, expectCluster: 'E' }, + { title: 'Singleton 2', hash: 's2', v: e6, expectCluster: 'F' }, + ]; + const stories = fixtures.map((f) => + story(f.title, 100 - fixtures.indexOf(f), 1, f.hash), + ); + const vecByTitle = new Map( + fixtures.map((f) => [normalizeForEmbedding(f.title), f.v]), + ); + const embedder = stubEmbedder(vecByTitle); + + const out = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: embedder.embedBatch, + redisPipeline: noopPipeline, + }); + + // 6 clusters total: 4 pairs + 2 singletons. + assert.equal(out.length, 6); + + // Each expected pair's hashes should land in the same cluster. + const pairs = [['a1', 'a2'], ['b1', 'b2'], ['c1', 'c2'], ['d1', 'd2']]; + for (const [x, y] of pairs) { + const cluster = out.find((c) => c.mergedHashes.includes(x)); + assert.ok(cluster?.mergedHashes.includes(y), `${x} and ${y} should cluster together`); + } + // Singletons stay alone. 
+ const s1 = out.find((c) => c.mergedHashes.includes('s1')); + const s2 = out.find((c) => c.mergedHashes.includes('s2')); + assert.equal(s1.mergedHashes.length, 1); + assert.equal(s2.mergedHashes.length, 1); + }); +}); + +// ── Scenario 9 — permutation-invariance property test ──────────────────────── + +describe('Scenario 9 — permutation-invariance', () => { + it('10 random input orders of the same 15-story set produce identical clusters', async () => { + // Construct 15 stories in 5 clusters of 3. Each cluster shares a + // near-unit basis vector; clusters are pairwise orthogonal. + const N_CLUSTERS = 5; + const PER_CLUSTER = 3; + const fixtures = []; + for (let c = 0; c < N_CLUSTERS; c++) { + const basis = Array.from({ length: N_CLUSTERS }, (_, i) => (i === c ? 1 : 0)); + for (let k = 0; k < PER_CLUSTER; k++) { + const jitter = basis.map((v, i) => (i === c ? v - k * 0.002 : v)); + fixtures.push({ + title: `Cluster ${c} item ${k}`, + hash: `c${c}-k${k}`, + v: jitter, + score: 100 - (c * PER_CLUSTER + k), + }); + } + } + const stories = fixtures.map((f) => story(f.title, f.score, 1, f.hash)); + const vecByTitle = new Map( + fixtures.map((f) => [normalizeForEmbedding(f.title), f.v]), + ); + + function sigFor(out) { + // Canonical representation: each cluster as a sorted hash list, + // overall list sorted. + return out.map((c) => [...c.mergedHashes].sort()).map((l) => l.join(',')).sort().join('|'); + } + + // Baseline run on the canonical input order. + const baseline = await deduplicateStories(stories, { + env: EMBED_MODE, + embedBatch: stubEmbedder(vecByTitle).embedBatch, + redisPipeline: noopPipeline, + }); + const baselineSig = sigFor(baseline); + + // Ten random permutations — each must produce the IDENTICAL cluster set. 
+ let seed = 42; + function rand() { + seed = (seed * 1103515245 + 12345) & 0x7fffffff; + return seed / 0x7fffffff; + } + for (let run = 0; run < 10; run++) { + const shuffled = [...stories]; + for (let i = shuffled.length - 1; i > 0; i--) { + const j = Math.floor(rand() * (i + 1)); + [shuffled[i], shuffled[j]] = [shuffled[j], shuffled[i]]; + } + const out = await deduplicateStories(shuffled, { + env: EMBED_MODE, + embedBatch: stubEmbedder(vecByTitle).embedBatch, + redisPipeline: noopPipeline, + }); + assert.equal( + sigFor(out), + baselineSig, + `permutation ${run} produced a different cluster set`, + ); + } + }); +}); + +// ── Entity extraction unit tests ────────────────────────────────────────────── + +describe('extractEntities', () => { + it('classifies country name as actor, strait as location', () => { + // Per plan intent: countries are geopolitical actors ("Iran does X"), + // physical geography is the venue. + const { locations, actors } = extractEntities('Iran closes Strait of Hormuz'); + assert.ok(actors.includes('iran')); + // Multi-word match finds "strait of hormuz", NOT the single-token + // fallback "hormuz" — the full phrase is in the gazetteer. 
+ assert.ok( + locations.includes('strait of hormuz') || locations.includes('hormuz'), + 'hormuz location must be detected (as phrase or single token)', + ); + assert.ok(!locations.includes('iran')); + }); + + it('classifies city as location, person as actor', () => { + const { locations, actors } = extractEntities('Biden meets Xi in Tokyo'); + assert.ok(locations.includes('tokyo')); + assert.ok(actors.includes('biden')); + assert.ok(actors.includes('xi')); + }); + + it('skips common capitalized sentence-starters', () => { + const { locations, actors } = extractEntities('The meeting begins'); + assert.equal(locations.length, 0); + assert.equal(actors.length, 0); + }); + + it('keeps sentence-start proper nouns', () => { + const { actors } = extractEntities('Trump to visit Japan'); + assert.ok(actors.includes('trump')); + // Japan is a country → actor, not location + assert.ok(actors.includes('japan')); + }); + + // Regression: multi-word gazetteer entries are matched as whole + // phrases. An earlier implementation split on whitespace and only + // checked single tokens, so "Red Sea", "South China Sea", "New York", + // etc. silently fell through to the actor bucket and disabled the + // veto for a whole class of real headlines. 
+ it('matches multi-word location: Red Sea', () => { + const { locations, actors } = extractEntities('Houthis strike ship in Red Sea'); + assert.ok(locations.includes('red sea')); + assert.ok(!actors.includes('red')); + assert.ok(!actors.includes('sea')); + assert.ok(actors.includes('houthis')); + }); + + it('matches multi-word location: South China Sea', () => { + const { locations } = extractEntities('Tensions flare in South China Sea'); + assert.ok(locations.includes('south china sea')); + }); + + it('matches multi-word location with lowercase connector: Strait of Hormuz', () => { + const { locations } = extractEntities('Iran closes Strait of Hormuz'); + assert.ok(locations.includes('strait of hormuz')); + }); + + it('matches multi-word city: Abu Dhabi', () => { + const { locations } = extractEntities('Summit held in Abu Dhabi'); + assert.ok(locations.includes('abu dhabi')); + }); + + it('matches multi-word city: New York', () => { + const { locations } = extractEntities('UN meeting in New York'); + assert.ok(locations.includes('new york')); + }); + + // Veto end-to-end: reproducer from the P1 finding. Two Red-Sea + // headlines share a location and disagree on the actor — veto + // MUST fire (otherwise the main anti-overmerge guard is off for + // bodies-of-water / region headlines). 
+ it('shouldVeto: Houthis vs US on Red Sea — location phrase match fires the veto', () => { + assert.equal( + shouldVeto('Houthis strike ship in Red Sea', 'US escorts convoy in Red Sea'), + true, + ); + }); +}); + +// ── Cosine helper ───────────────────────────────────────────────────────────── + +describe('cosineSimilarity', () => { + it('returns 1 for identical vectors', () => { + assert.equal(cosineSimilarity([1, 2, 3], [1, 2, 3]), 1); + }); + it('returns 0 for orthogonal vectors', () => { + assert.equal(cosineSimilarity([1, 0], [0, 1]), 0); + }); + it('handles a zero vector without throwing', () => { + assert.equal(cosineSimilarity([0, 0], [1, 1]), 0); + }); +}); diff --git a/tests/brief-dedup-jaccard.test.mjs b/tests/brief-dedup-jaccard.test.mjs new file mode 100644 index 000000000..1f5644d36 --- /dev/null +++ b/tests/brief-dedup-jaccard.test.mjs @@ -0,0 +1,243 @@ +/** + * Regression tests for the permanent Jaccard fallback path. + * + * The earlier harness parsed scripts/seed-digest-notifications.mjs + * with regexes to extract the dedup helpers into a Function() sandbox. + * Now that the logic lives in its own module we import directly — no + * regex fragility, no drift when the seed script is refactored. 
+ * + * Run: node --test tests/brief-dedup-jaccard.test.mjs + */ + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; + +import { + deduplicateStoriesJaccard, + extractTitleWords, + jaccardSimilarity, + stripSourceSuffix, +} from '../scripts/lib/brief-dedup-jaccard.mjs'; +import { + CACHE_KEY_PREFIX, + JACCARD_MERGE_THRESHOLD, +} from '../scripts/lib/brief-dedup-consts.mjs'; + +// ── Constants ───────────────────────────────────────────────────────────────── + +describe('brief-dedup-consts', () => { + it('exposes the Jaccard merge threshold as a pure constant', () => { + assert.equal(JACCARD_MERGE_THRESHOLD, 0.55); + }); + + it('embedding cache key prefix is namespaced + versioned', () => { + // Bump when the embed model or dimension changes — silent threshold + // drift on model upgrade is the #1 documented regression mode. + assert.equal(CACHE_KEY_PREFIX, 'brief:emb:v1:text-3-small-512'); + }); +}); + +// ── stripSourceSuffix ───────────────────────────────────────────────────────── + +describe('stripSourceSuffix', () => { + it('strips "- reuters.com"', () => { + assert.equal( + stripSourceSuffix('US fighter jet shot down over Iran - reuters.com'), + 'US fighter jet shot down over Iran', + ); + }); + + it('strips "- Reuters"', () => { + assert.equal( + stripSourceSuffix('Downed planes spell new peril for Trump - Reuters'), + 'Downed planes spell new peril for Trump', + ); + }); + + it('strips "- AP News"', () => { + assert.equal( + stripSourceSuffix('US military jets hit in Iran war - AP News'), + 'US military jets hit in Iran war', + ); + }); + + it('strips "- apnews.com"', () => { + assert.equal( + stripSourceSuffix('US military jets hit in Iran war - apnews.com'), + 'US military jets hit in Iran war', + ); + }); + + it('preserves titles without source suffix', () => { + assert.equal( + stripSourceSuffix('Myanmar coup leader elected president'), + 'Myanmar coup leader elected president', + ); + }); +}); + +// ── Pure helpers 
────────────────────────────────────────────────────────────── + +describe('extractTitleWords', () => { + it('drops stop-words and 1-2 char tokens', () => { + const words = extractTitleWords('The US is at war with Iran'); + // 'the', 'us', 'is', 'at', 'with' all drop. Remaining content + // words: 'war', 'iran'. + assert.ok(words.has('war')); + assert.ok(words.has('iran')); + assert.ok(!words.has('the')); + assert.ok(!words.has('us')); + assert.ok(!words.has('is')); + assert.ok(!words.has('at')); + }); + + it('applies stripSourceSuffix before tokenising', () => { + const words = extractTitleWords('Iran closes Hormuz - Reuters'); + assert.ok(words.has('iran')); + assert.ok(words.has('closes')); + assert.ok(words.has('hormuz')); + assert.ok(!words.has('reuters')); + }); +}); + +describe('jaccardSimilarity', () => { + it('returns 1 for identical Sets', () => { + const a = new Set(['iran', 'hormuz', 'strait']); + assert.equal(jaccardSimilarity(a, new Set(a)), 1); + }); + + it('returns 0 when either Set is empty', () => { + assert.equal(jaccardSimilarity(new Set(), new Set(['x'])), 0); + assert.equal(jaccardSimilarity(new Set(['x']), new Set()), 0); + }); + + it('is symmetric', () => { + const a = new Set(['a', 'b', 'c']); + const b = new Set(['b', 'c', 'd']); + assert.equal(jaccardSimilarity(a, b), jaccardSimilarity(b, a)); + }); +}); + +// ── deduplicateStoriesJaccard ───────────────────────────────────────────────── + +function story(title, score = 10, mentions = 1, hash = undefined) { + return { + title, + currentScore: score, + mentionCount: mentions, + sources: [], + severity: 'critical', + hash: hash ?? 
title.slice(0, 8), + }; +} + +describe('deduplicateStoriesJaccard', () => { + it('merges near-duplicate Reuters headlines about downed jet', () => { + const stories = [ + story('US fighter jet shot down over Iran, search underway for crew, US official says - reuters.com', 90), + story('US fighter jet shot down over Iran, search underway for crew, US officials say - reuters.com', 85), + story('US fighter jet shot down over Iran, search under way for crew member, US officials say - reuters.com', 80), + story('US fighter jet shot down over Iran, search under way for crew member, US officials say - Reuters', 75), + story('US fighter jet shot down over Iran, search underway for crew member, US officials say - Reuters', 70), + ]; + const result = deduplicateStoriesJaccard(stories); + assert.equal(result.length, 1, `Expected 1 cluster, got ${result.length}: ${result.map((r) => r.title).join(' | ')}`); + assert.equal(result[0].currentScore, 90); + assert.equal(result[0].mentionCount, 5); + }); + + it('keeps genuinely different stories separate', () => { + const stories = [ + story('US fighter jet shot down over Iran', 90), + story('Myanmar coup leader Min Aung Hlaing elected president', 80), + story('Brent oil spot price soars to $141', 70), + ]; + const result = deduplicateStoriesJaccard(stories); + assert.equal(result.length, 3); + }); + + it('merges same story reported by different outlets with different suffixes', () => { + const stories = [ + story('Downed planes spell new peril for Trump as Tehran hunts missing US pilot - Reuters', 90), + story('Downed planes spell new peril for Trump as Tehran hunts missing US pilot - reuters.com', 85), + ]; + const result = deduplicateStoriesJaccard(stories); + assert.equal(result.length, 1); + assert.equal(result[0].currentScore, 90); + }); + + it('merges stories with minor wording differences', () => { + const stories = [ + story('US rescues airman whose F-15 was downed in Iran, US officials say - Reuters', 90), + story('Iran 
says several enemy aircraft destroyed during US pilot rescue mission - Reuters', 80), + story('Trump, Israel pressure Iran ahead of deadline as search continues for missing US airman - Reuters', 70), + ]; + const result = deduplicateStoriesJaccard(stories); + // These are different enough events/angles that they should stay separate. + assert.ok(result.length >= 2, `Expected at least 2 clusters, got ${result.length}`); + }); + + it('carries mergedHashes from all clustered stories for source lookup', () => { + const stories = [ + story('US fighter jet shot down - reuters.com', 90, 1, 'hash_a'), + story('US fighter jet shot down - Reuters', 80, 1, 'hash_b'), + story('US fighter jet shot down - AP News', 70, 1, 'hash_c'), + ]; + const result = deduplicateStoriesJaccard(stories); + assert.equal(result.length, 1); + assert.deepEqual(result[0].mergedHashes, ['hash_a', 'hash_b', 'hash_c']); + }); + + it('preserves single stories without modification', () => { + const stories = [story('Only one story here', 50, 3)]; + const result = deduplicateStoriesJaccard(stories); + assert.equal(result.length, 1); + assert.equal(result[0].mentionCount, 3); + assert.deepEqual(result[0].mergedHashes, [stories[0].hash]); + }); +}); + +// ── Orchestrator kill-switch path ──────────────────────────────────────────── + +describe('brief-dedup orchestrator — jaccard kill switch', () => { + it('DIGEST_DEDUP_MODE=jaccard routes straight through the fallback', async () => { + const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs'); + let embedCalls = 0; + const stubEmbed = async () => { + embedCalls++; + throw new Error('embedBatch must NOT be called under MODE=jaccard'); + }; + const stories = [ + story('Iran closes Strait of Hormuz', 90, 1, 'h1'), + story('Iran shuts Strait of Hormuz - Reuters', 85, 1, 'h2'), + story('Myanmar coup leader elected president', 80, 1, 'h3'), + ]; + const out = await deduplicateStories(stories, { + env: { DIGEST_DEDUP_MODE: 'jaccard' }, + 
embedBatch: stubEmbed, + }); + assert.equal(embedCalls, 0); + const expected = deduplicateStoriesJaccard(stories); + assert.equal(out.length, expected.length); + for (let i = 0; i < out.length; i++) { + assert.equal(out[i].hash, expected[i].hash); + assert.deepEqual(out[i].mergedHashes, expected[i].mergedHashes); + assert.equal(out[i].mentionCount, expected[i].mentionCount); + } + }); + + it('returns [] for empty input without invoking Jaccard', async () => { + const { deduplicateStories } = await import('../scripts/lib/brief-dedup.mjs'); + let jaccardCalls = 0; + const stubJaccard = (s) => { + jaccardCalls++; + return deduplicateStoriesJaccard(s); + }; + const out = await deduplicateStories([], { + env: { DIGEST_DEDUP_MODE: 'jaccard' }, + jaccard: stubJaccard, + }); + assert.deepEqual(out, []); + assert.equal(jaccardCalls, 0); + }); +}); diff --git a/tests/digest-dedup.test.mjs b/tests/digest-dedup.test.mjs deleted file mode 100644 index 179f37e5a..000000000 --- a/tests/digest-dedup.test.mjs +++ /dev/null @@ -1,154 +0,0 @@ -/** - * Test: digest fuzzy deduplication merges near-duplicate stories. - * - * Run: node --test tests/digest-dedup.test.mjs - */ - -import { describe, it } from 'node:test'; -import assert from 'node:assert/strict'; -import { readFileSync } from 'node:fs'; -import { resolve, dirname } from 'node:path'; -import { fileURLToPath } from 'node:url'; - -const __dirname = dirname(fileURLToPath(import.meta.url)); -const src = readFileSync( - resolve(__dirname, '..', 'scripts', 'seed-digest-notifications.mjs'), - 'utf-8', -); - -// ── Extract the dedup functions via dynamic evaluation ──────────────────────── -// We extract the pure functions (no side-effects, no imports) to test them. 
- -const STOP_WORDS_BLOCK = src.match(/const STOP_WORDS = new Set\(\[[\s\S]*?\]\);/)?.[0]; -const stripSourceSuffix = src.match(/function stripSourceSuffix\(title\) \{[\s\S]*?\n\}/)?.[0]; -const extractTitleWords = src.match(/function extractTitleWords\(title\) \{[\s\S]*?\n\}/)?.[0]; -const jaccardSimilarity = src.match(/function jaccardSimilarity\(setA, setB\) \{[\s\S]*?\n\}/)?.[0]; -const deduplicateStories = src.match(/function deduplicateStories\(stories\) \{[\s\S]*?\n\}/)?.[0]; - -assert.ok(STOP_WORDS_BLOCK, 'STOP_WORDS not found in source'); -assert.ok(stripSourceSuffix, 'stripSourceSuffix not found in source'); -assert.ok(extractTitleWords, 'extractTitleWords not found in source'); -assert.ok(jaccardSimilarity, 'jaccardSimilarity not found in source'); -assert.ok(deduplicateStories, 'deduplicateStories not found in source'); - -const mod = {}; -new Function('mod', ` - ${STOP_WORDS_BLOCK} - ${stripSourceSuffix} - ${extractTitleWords} - ${jaccardSimilarity} - ${deduplicateStories} - mod.stripSourceSuffix = stripSourceSuffix; - mod.extractTitleWords = extractTitleWords; - mod.jaccardSimilarity = jaccardSimilarity; - mod.deduplicateStories = deduplicateStories; -`)(mod); - -// ── Tests ───────────────────────────────────────────────────────────────────── - -describe('stripSourceSuffix', () => { - it('strips "- reuters.com"', () => { - assert.equal( - mod.stripSourceSuffix('US fighter jet shot down over Iran - reuters.com'), - 'US fighter jet shot down over Iran', - ); - }); - - it('strips "- Reuters"', () => { - assert.equal( - mod.stripSourceSuffix('Downed planes spell new peril for Trump - Reuters'), - 'Downed planes spell new peril for Trump', - ); - }); - - it('strips "- AP News"', () => { - assert.equal( - mod.stripSourceSuffix('US military jets hit in Iran war - AP News'), - 'US military jets hit in Iran war', - ); - }); - - it('strips "- apnews.com"', () => { - assert.equal( - mod.stripSourceSuffix('US military jets hit in Iran war - apnews.com'), - 'US 
military jets hit in Iran war', - ); - }); - - it('preserves titles without source suffix', () => { - assert.equal( - mod.stripSourceSuffix('Myanmar coup leader elected president'), - 'Myanmar coup leader elected president', - ); - }); -}); - -describe('deduplicateStories', () => { - function story(title, score = 10, mentions = 1, hash = undefined) { - return { title, currentScore: score, mentionCount: mentions, sources: [], severity: 'critical', hash: hash ?? title.slice(0, 8) }; - } - - it('merges near-duplicate Reuters headlines about downed jet', () => { - const stories = [ - story('US fighter jet shot down over Iran, search underway for crew, US official says - reuters.com', 90), - story('US fighter jet shot down over Iran, search underway for crew, US officials say - reuters.com', 85), - story('US fighter jet shot down over Iran, search under way for crew member, US officials say - reuters.com', 80), - story('US fighter jet shot down over Iran, search under way for crew member, US officials say - Reuters', 75), - story('US fighter jet shot down over Iran, search underway for crew member, US officials say - Reuters', 70), - ]; - const result = mod.deduplicateStories(stories); - assert.equal(result.length, 1, `Expected 1 cluster, got ${result.length}: ${result.map(r => r.title).join(' | ')}`); - assert.equal(result[0].currentScore, 90); - assert.equal(result[0].mentionCount, 5); - }); - - it('keeps genuinely different stories separate', () => { - const stories = [ - story('US fighter jet shot down over Iran', 90), - story('Myanmar coup leader Min Aung Hlaing elected president', 80), - story('Brent oil spot price soars to $141', 70), - ]; - const result = mod.deduplicateStories(stories); - assert.equal(result.length, 3); - }); - - it('merges same story reported by different outlets with different suffixes', () => { - const stories = [ - story('Downed planes spell new peril for Trump as Tehran hunts missing US pilot - Reuters', 90), - story('Downed planes spell 
new peril for Trump as Tehran hunts missing US pilot - reuters.com', 85), - ]; - const result = mod.deduplicateStories(stories); - assert.equal(result.length, 1); - assert.equal(result[0].currentScore, 90); - }); - - it('merges stories with minor wording differences', () => { - const stories = [ - story('US rescues airman whose F-15 was downed in Iran, US officials say - Reuters', 90), - story('Iran says several enemy aircraft destroyed during US pilot rescue mission - Reuters', 80), - story('Trump, Israel pressure Iran ahead of deadline as search continues for missing US airman - Reuters', 70), - ]; - const result = mod.deduplicateStories(stories); - // These are different enough events/angles that they should stay separate - assert.ok(result.length >= 2, `Expected at least 2 clusters, got ${result.length}`); - }); - - it('carries mergedHashes from all clustered stories for source lookup', () => { - const stories = [ - story('US fighter jet shot down - reuters.com', 90, 1, 'hash_a'), - story('US fighter jet shot down - Reuters', 80, 1, 'hash_b'), - story('US fighter jet shot down - AP News', 70, 1, 'hash_c'), - ]; - const result = mod.deduplicateStories(stories); - assert.equal(result.length, 1); - assert.deepEqual(result[0].mergedHashes, ['hash_a', 'hash_b', 'hash_c']); - }); - - it('preserves single stories without modification', () => { - const stories = [story('Only one story here', 50, 3)]; - const result = mod.deduplicateStories(stories); - assert.equal(result.length, 1); - assert.equal(result[0].mentionCount, 3); - assert.deepEqual(result[0].mergedHashes, [stories[0].hash]); - }); -}); diff --git a/todos/193-complete-p1-dedup-regex-and-redis-pipeline-duplication.md b/todos/193-complete-p1-dedup-regex-and-redis-pipeline-duplication.md new file mode 100644 index 000000000..124c8271f --- /dev/null +++ b/todos/193-complete-p1-dedup-regex-and-redis-pipeline-duplication.md @@ -0,0 +1,82 @@ +--- +status: complete +priority: p1 +issue_id: 193 +tags: [code-review, 
digest-dedup, phase-a, maintenance, dry] +dependencies: [] +--- + +# Duplicated regex + duplicated Redis pipeline helper in digest-dedup Phase A + +## Problem Statement + +Two convergent duplication findings from the Phase A review (commit `cdd7a124c`) — both are silent-divergence risks because the unit tests cover each module in isolation. + +1. **`stripSourceSuffix` regex appears verbatim in TWO files.** + - `scripts/lib/brief-embedding.mjs:60-64` inside `normalizeForEmbedding` + - `scripts/lib/brief-dedup-jaccard.mjs:32-36` as the exported `stripSourceSuffix` + - Adding a new outlet (e.g. `Bloomberg`) to one and not the other means the veto/embed input and the Jaccard fallback input quietly drift. The plan's "normalization contract" explicitly calls out that these MUST agree. + +2. **`defaultRedisPipeline` reimplemented in TWO new files.** + - `scripts/lib/brief-dedup.mjs:74-94` + - `scripts/lib/brief-embedding.mjs:77-97` + - The repo already exports a canonical `redisPipeline()` from `api/_upstash-json.js:88` (imported by `seed-digest-notifications.mjs:32`). The new lib/ modules avoid importing from `api/` on purpose — but the existing `scripts/lib/` convention is to share via `scripts/lib/_*.mjs` helpers. + +## Findings + +- **Embedding regex collision**: `brief-embedding.mjs:60` and `brief-dedup-jaccard.mjs:33` keep the same outlet allow-list in two places. +- **Pipeline helper trio**: three copies of essentially the same 20-line `fetch('{url}/pipeline', …)` exist (the two above + the canonical one). Any fix to timeout / User-Agent / error shape has to touch all three. + +## Proposed Solutions + +### Option 1 — consolidate into brief-dedup-consts.mjs (lightest) +Move `stripSourceSuffix` + its outlet list into `brief-dedup-consts.mjs` as the source of truth. Have both `brief-embedding.mjs` and `brief-dedup-jaccard.mjs` import from there. Extract `defaultRedisPipeline` into `scripts/lib/_upstash-pipeline.mjs`. + +**Pros:** single-sourced, small diff. 
+**Cons:** mixes data + function in consts (plan called consts "pure data"); may need a `scripts/lib/_wire-suffixes.mjs` instead. +**Effort:** Small +**Risk:** Low + +### Option 2 — dedicated shared-helpers module +Add `scripts/lib/_dedup-shared.mjs` exporting `stripSourceSuffix`, `defaultRedisPipeline`, `normalizeForEmbedding` (which currently calls stripSourceSuffix inline). + +**Pros:** cleaner separation. +**Cons:** another file. +**Effort:** Small +**Risk:** Low + +### Option 3 — defer to a follow-up (not recommended) +Ship Phase A as-is; clean up in Phase B. + +**Pros:** ship faster. +**Cons:** two independent outlet lists on `main` makes the first production drift-incident hard to diagnose. +**Effort:** zero now +**Risk:** Medium — divergence is silent. + +## Recommended Action +_To be filled during triage._ + +## Technical Details + +Affected files: +- `scripts/lib/brief-embedding.mjs` +- `scripts/lib/brief-dedup-jaccard.mjs` +- `scripts/lib/brief-dedup.mjs` +- `scripts/lib/brief-dedup-consts.mjs` +- `api/_upstash-json.js` (canonical `redisPipeline`) + +No schema or DB changes. + +## Acceptance Criteria +- [ ] `stripSourceSuffix` regex exists in exactly ONE place; both consumer sites import from there. +- [ ] `defaultRedisPipeline` (or equivalent) exists in exactly ONE place within the new modules. +- [ ] Regression test asserts `normalizeForEmbedding("Foo - Reuters") === "foo"` and `normalizeForEmbedding` uses the same outlet list the Jaccard path uses (same-file import, guaranteed). +- [ ] `npm run test:data` still 5825/5825. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Plan: `docs/plans/2026-04-19-001-feat-embedding-based-story-dedup-plan.md` +- Reviewers: kieran-typescript-reviewer, security-sentinel, code-simplicity-reviewer, architecture-strategist diff --git a/todos/194-complete-p1-embedding-timeout-floor-and-negative-budget.md b/todos/194-complete-p1-embedding-timeout-floor-and-negative-budget.md new file mode 100644 index 000000000..c1b37b589 --- /dev/null +++ b/todos/194-complete-p1-embedding-timeout-floor-and-negative-budget.md @@ -0,0 +1,78 @@ +--- +status: complete +priority: p1 +issue_id: 194 +tags: [code-review, digest-dedup, phase-a, correctness, performance] +dependencies: [] +--- + +# Embedding `AbortSignal.timeout` floor + negative-budget fast-path + +## Problem Statement + +In `scripts/lib/brief-embedding.mjs:153` the batched embeddings call uses: + +```js +signal: AbortSignal.timeout(Math.max(250, timeoutMs)), +``` + +Two correctness concerns surfaced convergently by the perf + kieran-ts reviewers: + +1. **Floor allows total wall-clock overshoot.** The orchestrator's `DIGEST_DEDUP_WALL_CLOCK_MS` contract says "hard cap at 45s or abort the whole batch". If `deadline - nowImpl()` returns 40ms (cache lookup was slow), the floor lengthens the fetch to 250ms — the cron tick silently runs 210ms past budget. +2. **Negative budget fast-pathed to a guaranteed timeout.** If cache lookup already exceeded the deadline, `timeoutMs` goes negative. `Math.max(250, negative)` → 250ms, so we open an HTTP connection to OpenRouter that is guaranteed to time out. Wastes a network round-trip and an OpenRouter quota slot on a request we already know is doomed. + +Neither is a functional bug (all-or-nothing fallback still works), but both are incorrect by the "all-or-nothing" contract. + +## Findings + +- `brief-embedding.mjs:153` `Math.max(250, timeoutMs)` — discussed above. 
+- `brief-embedding.mjs:243` caller passes `timeoutMs: deadline - nowImpl()` without re-checking deadline. Line 242 throws on deadline-exceeded BEFORE cache lookup but not after, so the gap between cache lookup and API call is unchecked. + +## Proposed Solutions + +### Option 1 — pre-check and remove floor (recommended) +```js +if (timeoutMs <= 0) throw new EmbeddingTimeoutError(); +signal: AbortSignal.timeout(timeoutMs), +``` + +**Pros:** correct by construction; skips pointless round-trip. +**Cons:** 250ms floor was defensive for tiny-positive values (e.g. 15ms) where a real OpenRouter RTT + TLS handshake wouldn't fit anyway. But the orchestrator can't do better than the math says. +**Effort:** Small +**Risk:** Low — every test still passes (scenario 2 throws synchronously via the stub). + +### Option 2 — re-check deadline in orchestrator before embed call +In `brief-dedup.mjs:254-261`, bail to fallback before calling `embedImpl` when `nowImpl() >= deadline`. + +**Pros:** symmetry with cache-lookup check. +**Cons:** pushes contract knowledge up one layer. +**Effort:** Small +**Risk:** Low + +### Option 3 — keep the floor, log once on overshoot +Emit a `warn` when `timeoutMs < 250` so operators see it. + +**Pros:** visibility. +**Cons:** doesn't fix the overshoot. +**Effort:** Small +**Risk:** Low + +## Recommended Action +_Option 1 + Option 2 both; they're tiny._ + +## Technical Details +- `scripts/lib/brief-embedding.mjs:153` — the floor +- `scripts/lib/brief-embedding.mjs:240-242` — the call site's deadline math +- `scripts/lib/brief-dedup.mjs:247-254` — orchestrator embed call + +## Acceptance Criteria +- [ ] Passing `wallClockMs: 10` through a stubbed slow cache (e.g. now() monotonic past deadline) throws `EmbeddingTimeoutError` without invoking `fetch`. +- [ ] Passing `wallClockMs: 100` with a stub cache that takes 150ms throws `EmbeddingTimeoutError`. +- [ ] All existing 64 dedup tests still green. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewers: performance-oracle, kieran-typescript-reviewer diff --git a/todos/195-complete-p1-dead-env-getters-in-dedup-consts.md b/todos/195-complete-p1-dead-env-getters-in-dedup-consts.md new file mode 100644 index 000000000..8dd69d5c3 --- /dev/null +++ b/todos/195-complete-p1-dead-env-getters-in-dedup-consts.md @@ -0,0 +1,73 @@ +--- +status: complete +priority: p1 +issue_id: 195 +tags: [code-review, digest-dedup, phase-a, dead-code, yagni] +dependencies: [] +--- + +# Dead env-getter API in brief-dedup-consts.mjs + +## Problem Statement + +`scripts/lib/brief-dedup-consts.mjs:37-58` exports five env-reader functions: +- `getMode()` +- `isRemoteEmbedEnabled()` +- `isEntityVetoEnabled()` +- `getCosineThreshold()` +- `getWallClockMs()` + +None of them are imported by any other file (`grep` across `scripts/`, `server/`, `api/`, `tests/` returns zero hits). The orchestrator (`scripts/lib/brief-dedup.mjs:51-70`) reimplements every single one inline in `readOrchestratorConfig`. Flagged convergently by the simplicity + architecture + kieran-typescript reviewers. + +Two parallel parsers for the same five env vars is the classic "one will drift" setup. And the consts module's own header comment claims it's "Pure data, no network" — env readers violate that boundary. + +## Findings +- `brief-dedup-consts.mjs:36-58`: 22 LOC of unreachable code. +- `brief-dedup.mjs:51-70`: `readOrchestratorConfig` is the only real env-reader. +- No drift risk today (no callers), but guaranteed drift the first time someone adds a knob to one path and forgets the other. + +## Proposed Solutions + +### Option 1 — delete the dead getters (recommended) +Remove `getMode`, `isRemoteEmbedEnabled`, `isEntityVetoEnabled`, `getCosineThreshold`, `getWallClockMs` from consts. Leave consts as pure data + the `__constants` bag. + +**Pros:** removes 22 LOC of unreachable code; matches the module's stated purpose. 
+**Cons:** none. +**Effort:** Small +**Risk:** Zero (no callers). + +### Option 2 — delete the inline `readOrchestratorConfig`, have it call the getters +Keep the consts-module getters, wire `readOrchestratorConfig` to compose them. + +**Pros:** one env-parse implementation. +**Cons:** muddies consts; every call now does 5 separate env reads instead of one pass; the module-init vs per-call semantics get surprising. +**Effort:** Small +**Risk:** Low + +### Option 3 — ship as-is, add a TODO comment +Flag for Phase B cleanup. + +**Pros:** zero churn now. +**Cons:** ships known dead code. +**Effort:** Tiny +**Risk:** Low + +## Recommended Action +_Option 1. Deleting a dead surface that nothing imports is the safest change._ + +## Technical Details +- Delete lines 36-58 of `scripts/lib/brief-dedup-consts.mjs`. +- Keep the static constants + `__constants` bag. +- No test changes required. + +## Acceptance Criteria +- [ ] `scripts/lib/brief-dedup-consts.mjs` exports only pure constants + the `__constants` bag. +- [ ] `grep -r "getMode\b\|isRemoteEmbedEnabled\b\|isEntityVetoEnabled\b\|getCosineThreshold\b\|getWallClockMs\b"` finds 0 hits outside the orchestrator's inline body. +- [ ] All 64 dedup tests still green. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewers: code-simplicity-reviewer, architecture-strategist, kieran-typescript-reviewer diff --git a/todos/196-complete-p2-orchestrator-cleanup-bundle.md b/todos/196-complete-p2-orchestrator-cleanup-bundle.md new file mode 100644 index 000000000..b090aa2b1 --- /dev/null +++ b/todos/196-complete-p2-orchestrator-cleanup-bundle.md @@ -0,0 +1,78 @@ +--- +status: complete +priority: p2 +issue_id: 196 +tags: [code-review, digest-dedup, phase-a, cleanup, quality] +dependencies: [] +--- + +# Orchestrator cleanup bundle (re-exports, materializeCluster, vetoWrapper, double-Jaccard, warn fields, mode validation) + +## Problem Statement + +Six independent but small `brief-dedup.mjs` orchestrator issues found by the reviewers — grouped because they're all <10 LOC each and one PR makes more sense than six. + +### 1. Re-exports create import-graph ambiguity +`brief-dedup.mjs:312-317` re-exports `deduplicateStoriesJaccard`, `normalizeForEmbedding`, and `CACHE_TTL_SECONDS`. `normalizeForEmbedding` can now be pulled from both `brief-embedding.mjs` AND `brief-dedup.mjs`; the tools correctly pull from the source module but new callers can drift. + +### 2. `materializeCluster` duplicates Jaccard's representative logic +`brief-dedup.mjs:107-117` and `brief-dedup-jaccard.mjs:90-98` independently sort members by `[currentScore DESC, mentionCount DESC]`, sum mentionCount, project `mergedHashes`. Same contract in two places. + +### 3. `clusterWithEntityVeto` is a 6-line wrapper used once +`brief-dedup-embed.mjs:167-179` wraps `completeLinkCluster` with a pre-baked vetoFn. Called from exactly one site (`brief-dedup.mjs:261`). Inlining at the call site removes a function + export + indirection layer. + +### 4. 
Shadow mode runs Jaccard twice +`brief-dedup.mjs:275` calls `jaccardClusterHashesFor(stories)` to derive cluster-hash arrays for the diff; line 295 calls `jaccard(stories)` again to produce the returned value. Both are Jaccard(stories) in original order — compute once, derive `mergedHashes` from the returned reps, return them directly. + +### 5. `warn` log on fallback lacks `err.name` for filtering +`brief-dedup.mjs:303-306` emits `"[digest] dedup embed path failed, falling back to Jaccard: {msg}"`. Operators can't grep `reason=timeout` vs `reason=provider_5xx`. Add `err.name` / `err.status` to the structured log. + +### 6. Invalid `DIGEST_DEDUP_MODE` silently falls to `'jaccard'` +`brief-dedup.mjs:52-53` — if someone sets `DIGEST_DEDUP_MODE=embbed` (typo), the orchestrator accepts it as jaccard with no warn. Operator expects embed mode, gets Jaccard. Emit a one-shot warn when the raw value is truthy but unrecognised. + +## Findings +All six are convergent findings across kieran-ts, architecture, simplicity, and perf reviewers. Each is a <10 LOC change. + +## Proposed Solutions + +### Option 1 — single "orchestrator polish" commit covering all 6 (recommended) +One small diff per subsection; all test-covered by existing suites. + +**Pros:** clear unit of work; one review. +**Cons:** touches 3 files. +**Effort:** Small +**Risk:** Low + +### Option 2 — split into three commits +(a) duplication (re-exports + materializeCluster + vetoWrapper); (b) shadow double-Jaccard; (c) observability (warn fields + mode validation). + +**Pros:** easier to revert one if something surprises. +**Cons:** more churn. 
+**Effort:** Small +**Risk:** Low + +## Recommended Action +_To be filled during triage._ + +## Technical Details + +Affected files: +- `scripts/lib/brief-dedup.mjs` — re-exports, materializeCluster, shadow double-run, warn log, mode validation +- `scripts/lib/brief-dedup-embed.mjs` — delete `clusterWithEntityVeto` +- `scripts/lib/brief-dedup-jaccard.mjs` — extract shared representative helper (for #2) + +## Acceptance Criteria +- [ ] `brief-dedup.mjs` exports exactly `deduplicateStories` + `readOrchestratorConfig` + the types; no re-exports. +- [ ] One shared helper for representative selection; used by both Jaccard fallback and orchestrator materialization. +- [ ] `clusterWithEntityVeto` is deleted; orchestrator inlines the veto wiring. +- [ ] Shadow mode runs Jaccard exactly once per tick; disagreement diff is derived from that single run. +- [ ] Fallback warn line contains `reason={timeout|provider|other}` (or equivalent structured field). +- [ ] Unrecognised `DIGEST_DEDUP_MODE` value emits `warn` once per cron run with the raw value masked. +- [ ] All 64 dedup tests pass; add one new test per sub-fix. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewers: all 5 (kieran-typescript, security-sentinel, performance-oracle, architecture-strategist, code-simplicity-reviewer) diff --git a/todos/197-complete-p2-workflow-and-shadow-archive-hardening.md b/todos/197-complete-p2-workflow-and-shadow-archive-hardening.md new file mode 100644 index 000000000..4a7fb40f3 --- /dev/null +++ b/todos/197-complete-p2-workflow-and-shadow-archive-hardening.md @@ -0,0 +1,83 @@ +--- +status: complete +priority: p2 +issue_id: 197 +tags: [code-review, digest-dedup, phase-a, security, ci] +dependencies: [] +--- + +# Nightly workflow + shadow-archive hardening + +## Problem Statement + +Security-sentinel flagged two hardening items in the new CI workflow + shadow-mode archive — both are defence-in-depth, not exploitable today, but should land before the Phase C flip that actually writes archives. + +### 1. `.github/workflows/dedup-golden-pairs.yml:59` — unquoted heredoc +```yaml +BODY=$(cat <:<8-char contentHash>`. ISO timestamps + cadence make the prefix enumerable; story IDs are safe hashes but `normalizedTitles` is plaintext wire. Per data classification titles are public, so there's no secret leak — but two hardening moves would make the archive more defensible: +- Drop `normalizedTitles` from the archive; have `shadow-sample.mjs` re-read from `story:track:v1:` at draw time. +- Document the `brief:dedup:shadow:v1:*` prefix in the Upstash access-controls runbook so future changes know it holds wire titles. + +### 3. `shadow-sample.mjs:53` — no command allowlist on the Upstash helper +The helper accepts `(command, ...params)` with `encodeURIComponent` on each segment. Safe today because only the tool calls it with `SCAN`/`GET`. If a future caller passes user input into `params`, the percent-encoding still protects path structure — but a dangerous command (`FLUSHDB`, `DEL`) could land. 
Hard-allowlist the `command` to `['SCAN', 'GET', 'EXISTS']`. + +## Findings +All three are reviewer-flagged defensive hardening. None affect the Phase A no-op ship path. + +## Proposed Solutions + +### Option 1 — minimum fix (recommended) +- Quote the heredoc delimiter in the workflow (1 char). +- Add command allowlist in `shadow-sample.mjs` (6 lines). +- Leave `normalizedTitles` in the archive; add a comment documenting the wire-text classification. + +**Pros:** small, focused, safe; defers the bigger archive-shape change. +**Cons:** doesn't eliminate the wire-text enumeration concern, only documents it. +**Effort:** Small +**Risk:** Low + +### Option 2 — full hardening +All three items above. + +**Pros:** maximal defence. +**Cons:** archive-shape change touches `shadow-sample.mjs` (title lookup) and the orchestrator (drop field); slightly bigger diff. +**Effort:** Small-Medium +**Risk:** Low + +### Option 3 — defer to Phase C +Ship Phase A as-is; harden before flipping to `shadow` mode. + +**Pros:** zero Phase A churn. +**Cons:** the heredoc is a weak-positive attack surface; the workflow could run before Phase C if the manual `workflow_dispatch` is triggered. +**Effort:** zero now +**Risk:** Low-Medium + +## Recommended Action +_To be filled during triage._ + +## Technical Details +- `.github/workflows/dedup-golden-pairs.yml:59` +- `scripts/lib/brief-dedup.mjs:170-180` (archive shape) +- `scripts/tools/shadow-sample.mjs:53` (command allowlist) + +## Acceptance Criteria +- [ ] Heredoc delimiter is `<<'EOF'` (quoted). +- [ ] `shadow-sample.mjs`'s Upstash helper rejects any command not in a hardcoded allowlist. +- [ ] (Optional) Shadow archive no longer stores plaintext titles; sampler rehydrates from `story:track:v1:*`. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewer: security-sentinel diff --git a/todos/198-complete-p2-test-and-observability-polish.md b/todos/198-complete-p2-test-and-observability-polish.md new file mode 100644 index 000000000..e344613ce --- /dev/null +++ b/todos/198-complete-p2-test-and-observability-polish.md @@ -0,0 +1,66 @@ +--- +status: complete +priority: p2 +issue_id: 198 +tags: [code-review, digest-dedup, phase-a, tests, observability] +dependencies: [] +--- + +# Test + observability polish on the Phase A embed path + +## Problem Statement + +Two small quality items the reviewers flagged. + +### 1. Embedding test Scenario 2 only asserts length, not content +`tests/brief-dedup-embedding.test.mjs:164` — `assert.equal(out.length, expected.length)`. A regression that changed Jaccard's merging shape but kept cluster count constant would slip through. The equivalent parity test in `tests/brief-dedup-jaccard.test.mjs:214-220` does a deep per-cluster compare (hash, mergedHashes, mentionCount). Match that pattern in the timeout/outage fallback scenarios. + +### 2. Missing API-key path logs an exception every cron tick +If `DIGEST_DEDUP_MODE=embed` is set but `OPENROUTER_API_KEY` is empty (rotated, forgotten, or misconfigured after a Railway env edit), the orchestrator catches the `EmbeddingProviderError('OPENROUTER_API_KEY not configured')` and warns. That's correct behaviour for the all-or-nothing fallback, but it emits the same warn every tick — noisy in Sentry/log search. + +Two options: +- Add a pre-flight in `readOrchestratorConfig` that checks the key and short-circuits to jaccard with a ONE-SHOT warn (per process). +- Add `reason=no_api_key` to the warn line so it's filterable. + +Option (a) is lower-volume, option (b) is easier to implement and scoped to the overlapping P2 #196 subsection 5 ("warn line missing err.name"). Pick one. + +## Findings +Both items surfaced from the kieran-ts reviewer. 
+ +## Proposed Solutions + +### Option 1 — tighten both (recommended) +- Deep-equal the fallback clusters in Scenario 2 + Scenario 3 (same 6-line pattern as jaccard orchestrator test). +- Add a `reason=` field to the fallback warn (dovetails with #196.5). Skip the one-shot warn; the structured field is enough. + +**Pros:** small, covered by existing harness. +**Cons:** none. +**Effort:** Small +**Risk:** Low + +### Option 2 — one-shot warn per process +Module-scope `_apiKeyWarned` flag; log only on first miss. + +**Pros:** quiet in log. +**Cons:** cross-process: Railway cron each tick forks a new node process, so every cron tick IS a first miss — the flag doesn't help. Conclusion: do NOT implement this; rely on `reason=` labels instead. +**Effort:** trivial +**Risk:** wasted — the flag doesn't persist across cron tick lifetimes. + +## Recommended Action +_Option 1._ + +## Technical Details +- `tests/brief-dedup-embedding.test.mjs:164` (Scenario 2) +- `tests/brief-dedup-embedding.test.mjs:196` (Scenario 3) +- `scripts/lib/brief-dedup.mjs:303-306` (warn log) + +## Acceptance Criteria +- [ ] Scenarios 2 and 3 deep-equal the output clusters against the Jaccard expected shape (hash, mergedHashes, mentionCount). +- [ ] Fallback warn line includes a filterable `reason=` field. + +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewer: kieran-typescript-reviewer diff --git a/todos/199-complete-p3-phase-a-nits-bundle.md b/todos/199-complete-p3-phase-a-nits-bundle.md new file mode 100644 index 000000000..a630eda87 --- /dev/null +++ b/todos/199-complete-p3-phase-a-nits-bundle.md @@ -0,0 +1,84 @@ +--- +status: complete +priority: p3 +issue_id: 199 +tags: [code-review, digest-dedup, phase-a, nits] +dependencies: [] +--- + +# Phase A nits bundle (constants bag, mocked-golden overlap, apiKey dep rename, tools/ convention, gazetteer JSON, unique-hash precondition) + +## Problem Statement + +Six P3 items across the reviewers. 
None of them block merge; several are "consider in Phase B/C". + +### 1. `__constants` test-bag is redundant +`scripts/lib/brief-dedup-consts.mjs:64-74` exposes a frozen object used by `tests/brief-dedup-jaccard.test.mjs:27,33,37` to read three constants. A direct named import is identical and shorter. The bag's original purpose ("replace the regex-extraction harness") is already served by normal imports. + +### 2. `brief-dedup-golden.test.mjs` (138 LOC) largely mirrors scenarios in `brief-dedup-embedding.test.mjs` +The mocked-embedder path tests with crafted vectors are structurally forced — they're testing the mock. The real value of "golden pairs" is the LIVE-embedder nightly workflow + `tests/fixtures/brief-dedup-golden-pairs.json`. Consider removing the mocked test file; keep the fixture + workflow. + +### 3. `deps.apiKey` override in `brief-embedding.mjs:228` shares the shape with prod wiring +A future caller spreading unvalidated user data into `deps` could inject an attacker-controlled API key that goes to `openrouter.ai` in the `Authorization: Bearer` header. Currently unreachable. Rename to `deps._apiKey` or gate on a test-only `Symbol` to make misuse noisy. + +### 4. `scripts/tools/` is a new top-level directory +No prior precedent in the repo; ops scripts have historically been `scripts/backfill-*.mjs`, `scripts/benchmark-*.mjs` at top level. A `tools/` subdir is defensible for these three related utilities but worth documenting in `AGENTS.md` ("anything not on the cron lives in tools/"). + +### 5. `entity-gazetteer.mjs` as two inline `Set`s +Fine for v1. For auditability and NER-pluggability consider promoting `LOCATION_GAZETTEER` / `COMMON_CAPITALIZED` to `scripts/shared/location-gazetteer.json` with a tiny loader. Non-code reviewers can diff entries; easier to publish to `shared/` if a future CJS consumer needs it. + +### 6. `diffClustersByHash` assumes unique story hashes +`brief-dedup.mjs:126-148` builds hash→cluster-id maps. 
If two stories share a hash, the second silently overwrites the first. Safe today (upstream dedup guarantees uniqueness) but a JSDoc `@pre stories have unique .hash` would future-proof. + +## Findings +All six are convergent or single-reviewer items. Each is independent. + +## Proposed Solutions + +### Option 1 — address as Phase B/C prep work (recommended) +Do #1 and #3 now (tiny renames / deletions), defer #2 and #5 until Phase B/C validates the feature is shipping, document #4 in AGENTS.md during the next cleanup pass, add #6 as a one-line JSDoc. + +**Pros:** reflects the risk profile; doesn't churn Phase A for cosmetic wins. +**Cons:** leaves some nits on `main`. +**Effort:** Small for the now-items; negligible otherwise. +**Risk:** Low + +### Option 2 — do everything now +Single "Phase A nits" commit. + +**Pros:** closes all the follow-ups. +**Cons:** #2 and #5 touch enough surface that they should have their own PR. +**Effort:** Medium +**Risk:** Low + +### Option 3 — defer all +Track for Phase B. + +**Pros:** zero churn. +**Cons:** accumulates debt. +**Effort:** zero +**Risk:** Low + +## Recommended Action +_To be filled during triage._ + +## Technical Details +- `scripts/lib/brief-dedup-consts.mjs:64-74` (#1) +- `tests/brief-dedup-golden.test.mjs` (#2, whole file) +- `scripts/lib/brief-embedding.mjs:228` (#3) +- `AGENTS.md` + `scripts/tools/*` (#4) +- `scripts/lib/entity-gazetteer.mjs` → `scripts/shared/location-gazetteer.json` (#5) +- `scripts/lib/brief-dedup.mjs:126-148` (#6 — one-line JSDoc) + +## Acceptance Criteria +- [ ] (#1) `__constants` bag removed; jaccard test imports constants directly. +- [ ] (#3) `deps.apiKey` renamed to a less spreadable name. +- [ ] (#6) `diffClustersByHash` has an `@pre` JSDoc about unique hashes. +- [ ] (#2 / #4 / #5) captured as Phase B/C todos or deferred with rationale. 
+ +## Work Log +_Empty — awaiting triage._ + +## Resources +- Review commit: `cdd7a124c` +- Reviewers: code-simplicity, security-sentinel, architecture-strategist, kieran-typescript