diff --git a/scripts/brief-quality-report.mjs b/scripts/brief-quality-report.mjs new file mode 100644 index 000000000..ccf7d90a5 --- /dev/null +++ b/scripts/brief-quality-report.mjs @@ -0,0 +1,370 @@ +#!/usr/bin/env node +// Daily brief-quality dashboard. +// +// Pulls the most recent N replay-log ticks for a (variant, lang, +// sensitivity, date) tuple and computes a single quality_score plus +// the component metrics that produced it. Run daily; watch the trend. +// +// "Are we getting better" loop: +// 1. Run this script, record the quality_score. +// 2. Make a config change (env flip, code merge, threshold tune). +// 3. Wait one cron tick, re-run, compare. +// 4. If quality_score went down, revert. +// +// Metrics computed: +// - pair_recall_cluster — % of "should-cluster" labeled pairs that +// end up in the same topic at the active threshold +// - false_adjacency — % of "should-separate" labeled pairs that end +// up adjacent (false positive) +// - cap_truncation_rate — % of qualified stories truncated by the +// MAX_STORIES_PER_USER cap (computed from production drop logs if +// supplied via stdin, else estimated as max(0, in - 16)/in from +// replay record counts) +// - multi_member_topic_share — % of topics with size > 1 +// - quality_score — composite (recall × 0.6 + (1-false-adj) × 0.3 + +// multi-member × 0.1) +// +// Usage: +// node --import tsx/esm scripts/brief-quality-report.mjs # today, full:en:all +// node --import tsx/esm scripts/brief-quality-report.mjs --rule full:en:critical # specific rule +// node --import tsx/esm scripts/brief-quality-report.mjs --date 2026-04-24 # specific date +// node --import tsx/esm scripts/brief-quality-report.mjs --json # machine-readable +// +// Pipe production drop logs for accurate cap-truncation: +// railway logs --service scripts-cron-digest-notifications | grep 'brief filter drops' | \ +// node --import tsx/esm scripts/brief-quality-report.mjs --drop-lines-stdin + +import { readFileSync } from 'node:fs'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { loadEnvFile, getRedisCredentials } from './_seed-utils.mjs'; +import { singleLinkCluster } from './lib/brief-dedup-embed.mjs'; +import { normalizeForEmbedding } from './lib/brief-embedding.mjs'; + +loadEnvFile(import.meta.url); + +const REPLAY_KEY_PREFIX = 'digest:replay-log:v1'; + +function parseArgs(argv) { + const out = { + date: new Date().toISOString().slice(0, 10), + rule: 'full:en:all', + json: false, + dropLinesStdin: false, + }; + for (let i = 2; i < argv.length; i++) { + const a = argv[i]; + if (a === '--date') out.date = argv[++i]; + else if (a === '--rule') out.rule = argv[++i]; + else if (a === '--json') out.json = true; + else if (a === '--drop-lines-stdin') out.dropLinesStdin = true; + else if (a === '--help' || a === '-h') { + console.log(readFileSync(fileURLToPath(import.meta.url), 'utf8').split('\n').slice(0, 38).join('\n')); + process.exit(0); + } + } + return out; +} + +async function redisLrangeAll(url, token, key) { + const out = []; + const PAGE = 1000; + let start = 0; + while (true) { + const stop = start + PAGE - 1; + const res = await fetch(`${url}/lrange/${encodeURIComponent(key)}/${start}/${stop}`, { + headers: { Authorization: `Bearer ${token}` }, + }); + if (!res.ok) throw new Error(`LRANGE failed: HTTP ${res.status}`); + const body = await res.json(); + const items = Array.isArray(body?.result) ? body.result : []; + out.push(...items); + if (items.length < PAGE) break; + start += PAGE; + } + return out; +} + +async function redisMget(url, token, keys) { + if (keys.length === 0) return []; + const path = keys.map((k) => encodeURIComponent(k)).join('/'); + const res = await fetch(`${url}/mget/${path}`, { headers: { Authorization: `Bearer ${token}` } }); + if (!res.ok) throw new Error(`MGET failed: HTTP ${res.status}`); + const body = await res.json(); + return Array.isArray(body?.result) ? body.result : new Array(keys.length).fill(null); +} + +function loadLabels() { + const __dirname = dirname(fileURLToPath(import.meta.url)); + const raw = JSON.parse(readFileSync(resolve(__dirname, 'data', 'brief-adjacency-pairs.json'), 'utf8')); + return (raw.pairs ?? []).map((p) => ({ + a: normalizeForEmbedding(p.title_a), + b: normalizeForEmbedding(p.title_b), + expected: p.expected, + })); +} + +async function readStdinDropLines() { + if (process.stdin.isTTY) return []; + const chunks = []; + for await (const chunk of process.stdin) chunks.push(chunk); + return Buffer.concat(chunks).toString('utf8').split('\n').filter((l) => l.includes('brief filter drops')); +} + +function parseDropLine(line) { + // [digest] brief filter drops user=X sensitivity=Y variant=Z outcome=W in=N dropped_*=N out=N + const fields = {}; + for (const m of line.matchAll(/(\w+)=([^\s]+)/g)) fields[m[1]] = m[2]; + return fields; +} + +function summariseDropLines(lines) { + let in_total = 0, out_total = 0, cap_total = 0, samples = 0; + let shipped = 0, rejected = 0; + for (const line of lines) { + const f = parseDropLine(line); + if (!f.in || !f.out) continue; + in_total += Number(f.in); + out_total += Number(f.out); + cap_total += Number(f.dropped_cap ?? 0); + samples += 1; + if (f.outcome === 'shipped') shipped += 1; + else if (f.outcome === 'rejected') rejected += 1; + } + return { + samples, + shipped, + rejected, + cap_truncation_rate: in_total > 0 ? cap_total / in_total : 0, + avg_in: samples > 0 ? in_total / samples : 0, + avg_out: samples > 0 ? out_total / samples : 0, + }; +} + +// Mirror production: groupTopicsPostDedup operates on top-N reps after +// the score floor, not the raw 800-rep deduped pool. Read from env so +// a Railway DIGEST_SCORE_MIN / DIGEST_MAX_ITEMS flip stays in sync; +// fall back to documented defaults if env is empty/invalid. +const SCORE_FLOOR_DEFAULT = 63; +const TOP_N_DEFAULT = 30; +const MIN_SURVIVING_REPS = 5; + +function envInt(name, fallback) { + const raw = process.env[name]; + if (raw == null || raw === '') return fallback; + const n = Number.parseInt(raw, 10); + return Number.isFinite(n) && n > 0 ? n : fallback; +} +const SCORE_FLOOR = envInt('DIGEST_SCORE_MIN', SCORE_FLOOR_DEFAULT); +const TOP_N = envInt('DIGEST_MAX_ITEMS', TOP_N_DEFAULT); + +function scoreReplay({ records, embeddingByHash, labels, threshold }) { + // Reuse the latest tick's reps as the canonical "today's brief" sample. + const ticks = new Map(); + for (const r of records) { + if (!ticks.has(r.briefTickId)) ticks.set(r.briefTickId, []); + ticks.get(r.briefTickId).push(r); + } + const tickIds = [...ticks.keys()].sort(); + const latestTickId = tickIds[tickIds.length - 1]; + if (!latestTickId) return null; + const allReps = ticks.get(latestTickId).filter((r) => r.isRep); + if (allReps.length === 0) return null; + + // Apply floor + slice to mirror production. + const slicedReplay = allReps + .filter((r) => Number(r.currentScore ?? 0) >= SCORE_FLOOR) + .sort((a, b) => Number(b.currentScore ?? 0) - Number(a.currentScore ?? 0)) + .slice(0, TOP_N); + if (slicedReplay.length <= 1) return null; + + // Remap shape: replay uses storyHash/normalizedTitle; brief-dedup + // expects hash/title. Title carries the normalized form so labels + // match directly. Filter out reps whose embedding is missing from + // the cache (transient eviction); skip the tick only if too few + // reps survive. + const remapped = slicedReplay.map((r) => ({ + hash: r.storyHash, + title: r.normalizedTitle, + currentScore: r.currentScore, + })); + const sliced = remapped.filter((r) => Array.isArray(embeddingByHash.get(r.hash))); + const missingEmbedDrops = remapped.length - sliced.length; + if (sliced.length < MIN_SURVIVING_REPS) { + return { error: `only ${sliced.length} reps had cached embeddings (need ≥${MIN_SURVIVING_REPS}); ${missingEmbedDrops} dropped — re-run after cache warm-up` }; + } + const items = sliced.map((r) => ({ title: r.title, embedding: embeddingByHash.get(r.hash) })); + + // Direct single-link partition matches what production groupTopicsPostDedup does internally. + const { clusters } = singleLinkCluster(items, { cosineThreshold: threshold, vetoFn: null }); + + const topicOfIdx = new Array(sliced.length).fill(-1); + clusters.forEach((members, tIdx) => { for (const i of members) topicOfIdx[i] = tIdx; }); + + const titleToTopic = new Map(); + for (let i = 0; i < sliced.length; i++) titleToTopic.set(sliced[i].title, topicOfIdx[i]); + + const topicCount = clusters.length; + const sizes = clusters.map((c) => c.length); + + let cluster_total = 0, cluster_hit = 0, separate_total = 0, separate_violation = 0; + const violations = []; + for (const lab of labels) { + const tA = titleToTopic.get(lab.a); + const tB = titleToTopic.get(lab.b); + if (tA == null || tB == null) continue; + const clustered = tA === tB; + if (lab.expected === 'cluster') { + cluster_total += 1; + if (clustered) cluster_hit += 1; + else violations.push({ kind: 'missed_cluster', a: lab.a, b: lab.b }); + } else { + separate_total += 1; + if (clustered) { + separate_violation += 1; + violations.push({ kind: 'false_adjacency', a: lab.a, b: lab.b }); + } + } + } + + const pair_recall_cluster = cluster_total > 0 ? cluster_hit / cluster_total : 0; + const false_adjacency = separate_total > 0 ? separate_violation / separate_total : 0; + const multi_member = sizes.filter((x) => x > 1).length; + const multi_member_topic_share = topicCount > 0 ? multi_member / topicCount : 0; + + return { + tick_id: latestTickId, + rep_count: allReps.length, + sliced_rep_count: sliced.length, + missing_embed_drops: missingEmbedDrops, + score_floor: SCORE_FLOOR, + top_n: TOP_N, + topic_count: topicCount, + multi_member_topics: multi_member, + multi_member_topic_share, + pair_recall_cluster, + false_adjacency, + cluster_pairs_evaluated: cluster_total, + separate_pairs_evaluated: separate_total, + violations, + quality_score: pair_recall_cluster * 0.6 + (1 - false_adjacency) * 0.3 + multi_member_topic_share * 0.1, + }; +} + +function renderReport(out) { + const L = []; + L.push(`# Brief Quality Report — ${out.ctx.rule} on ${out.ctx.date}`); + L.push(''); + L.push(`Active topic threshold: ${out.ctx.threshold} (env DIGEST_DEDUP_TOPIC_THRESHOLD or default 0.45)`); + L.push(`Replay records: ${out.ctx.recordCount} across ${out.ctx.tickCount} ticks`); + L.push(''); + if (out.replay?.error) { + L.push('## Topic-grouping quality (latest tick)'); + L.push(''); + L.push(`⚠️ Could not score: ${out.replay.error}`); + L.push(''); + } else if (out.replay) { + L.push('## Topic-grouping quality (latest tick)'); + L.push(''); + L.push(`- **quality_score: ${out.replay.quality_score.toFixed(3)}** (target: ↑ over time)`); + L.push(`- pair_recall_cluster: ${(out.replay.pair_recall_cluster * 100).toFixed(1)}% (${out.replay.cluster_pairs_evaluated} labeled pairs evaluated)`); + L.push(`- false_adjacency: ${(out.replay.false_adjacency * 100).toFixed(1)}% (${out.replay.separate_pairs_evaluated} labeled pairs evaluated)`); + L.push(`- multi_member_topic_share: ${(out.replay.multi_member_topic_share * 100).toFixed(1)}% (${out.replay.multi_member_topics}/${out.replay.topic_count} topics)`); + L.push(`- topic_count: ${out.replay.topic_count} (from ${out.replay.sliced_rep_count} sliced reps; ${out.replay.rep_count} total in tick; floor=${out.replay.score_floor}, topN=${out.replay.top_n}${out.replay.missing_embed_drops > 0 ? `, ${out.replay.missing_embed_drops} reps dropped on missing embedding` : ''})`); + if (out.replay.violations?.length > 0) { + L.push(''); + L.push(' Violations vs labeled pairs:'); + for (const v of out.replay.violations) { + const arrow = v.kind === 'missed_cluster' ? '✗ should-cluster but separate' : '✗ should-separate but clustered'; + L.push(` ${arrow}: "${v.a.slice(0, 60)}…" ↔ "${v.b.slice(0, 60)}…"`); + } + } + L.push(''); + } + if (out.drops) { + L.push('## Production filter-drop telemetry (from stdin)'); + L.push(''); + L.push(`- samples: ${out.drops.samples} (shipped=${out.drops.shipped}, rejected=${out.drops.rejected})`); + L.push(`- avg in: ${out.drops.avg_in.toFixed(1)} stories/tick`); + L.push(`- avg out: ${out.drops.avg_out.toFixed(1)} stories/tick`); + L.push(`- **cap_truncation_rate: ${(out.drops.cap_truncation_rate * 100).toFixed(1)}%** (target: ↓ after cap bump)`); + L.push(''); + } + L.push('## Interpretation'); + L.push(''); + L.push('- Higher `quality_score` and `pair_recall_cluster`, lower `false_adjacency` and `cap_truncation_rate` = better.'); + L.push('- Run before each config change; compare deltas. If a change moves quality_score down, revert.'); + L.push('- Add labeled pairs to `scripts/data/brief-adjacency-pairs.json` whenever a brief surfaces an adjacency outcome that\'s clearly right or clearly wrong.'); + return L.join('\n'); +} + +async function main() { + const args = parseArgs(process.argv); + const { url, token } = getRedisCredentials(); + const replayKey = `${REPLAY_KEY_PREFIX}:${args.rule}:${args.date}`; + + const rawList = await redisLrangeAll(url, token, replayKey); + const records = rawList.map((s) => { try { return JSON.parse(s); } catch { return null; } }).filter(Boolean); + if (records.length === 0) { + console.error(`No replay records at ${replayKey}.`); + process.exit(2); + } + const tickIds = new Set(records.map((r) => r.briefTickId)); + + // Load embeddings for the latest tick only (the dashboard only scores + // the latest snapshot — earlier ticks are the sweep harness's job). + const sortedTickIds = [...tickIds].sort(); + const latestTickId = sortedTickIds[sortedTickIds.length - 1]; + const latestRecords = records.filter((r) => r.briefTickId === latestTickId); + const reps = latestRecords.filter((r) => r.isRep); + const cacheKeys = [...new Set(reps.map((r) => r.embeddingCacheKey).filter(Boolean))]; + const CHUNK = 50; + const embByCacheKey = new Map(); + for (let i = 0; i < cacheKeys.length; i += CHUNK) { + const chunk = cacheKeys.slice(i, i + CHUNK); + const vals = await redisMget(url, token, chunk); + for (let j = 0; j < chunk.length; j++) { + if (typeof vals[j] !== 'string') continue; + try { const v = JSON.parse(vals[j]); if (Array.isArray(v)) embByCacheKey.set(chunk[j], v); } catch { /* skip */ } + } + } + const embeddingByHash = new Map(); + for (const r of reps) { + const v = embByCacheKey.get(r.embeddingCacheKey); + if (Array.isArray(v)) embeddingByHash.set(r.storyHash, v); + } + + // Active threshold: read from latest tickConfig, else default 0.45. + const threshold = latestRecords[0]?.tickConfig?.topicThreshold ?? 0.45; + const labels = loadLabels(); + + // Always call scoreReplay when there are reps. The function itself + // filters missing embeddings and returns { error: '…' } if too few + // survive (MIN_SURVIVING_REPS guard); renderReport surfaces that + // error path with a ⚠️ warning. Gating here on + // `embeddingByHash.size === reps.length` was defeating the + // intended graceful-degradation behaviour — Greptile P2 on PR #3390. + const replay = reps.length > 0 + ? scoreReplay({ records: latestRecords, embeddingByHash, labels, threshold }) + : null; + + const dropLines = args.dropLinesStdin ? await readStdinDropLines() : []; + const drops = dropLines.length > 0 ? summariseDropLines(dropLines) : null; + + const out = { + ctx: { rule: args.rule, date: args.date, threshold, recordCount: records.length, tickCount: tickIds.size }, + replay, + drops, + }; + + if (args.json) { + console.log(JSON.stringify(out, null, 2)); + } else { + console.log(renderReport(out)); + } +} + +main().catch((err) => { + console.error(`brief-quality-report: ${err?.stack ?? err?.message ?? String(err)}`); + process.exit(1); +}); diff --git a/scripts/data/brief-adjacency-pairs.json b/scripts/data/brief-adjacency-pairs.json new file mode 100644 index 000000000..b9f94aaa5 --- /dev/null +++ b/scripts/data/brief-adjacency-pairs.json @@ -0,0 +1,102 @@ +{ + "meta": { + "doc": "Labeled adjacency pairs for brief topic-grouping evaluation. Read by scripts/sweep-topic-thresholds.mjs and scripts/brief-quality-report.mjs. Each pair carries two normalized headlines and an expected verdict ('cluster' or 'separate'). The harness matches by normalizeForEmbedding(title) against replay-log records, so titles must match what was actually embedded (apply the same wire-suffix-stripping as scripts/lib/brief-embedding.mjs:normalizeForEmbedding).", + "schema_version": 1, + "observed_briefs": [ + "2026-04-24-2001 — first brief that surfaced the adjacency complaint (Iran-Iran-Iran split)", + "2026-04-25-0802 — first brief after PR #3387 merged; Iran cluster of size 6 plus 3 split-off Iran subtopics" + ], + "labeling_guidance": [ + "cluster — these two stories cover the SAME news event or share so much narrative context that a reader would expect them to be adjacent in a daily brief.", + "separate — these two stories cover unrelated news events; clustering them would surface as a false adjacency to the reader.", + "Aim for a balanced ratio. 1:1 cluster:separate is healthy. Heavy-skew toward 'cluster' inflates recall numbers without measuring false-adjacency.", + "Add new pairs whenever a brief surfaces an adjacency outcome that was clearly right or clearly wrong. Treat this file as an append-only labeled corpus." + ] + }, + "pairs": [ + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "US Navy secretary fired amid Iran blockade", + "expected": "cluster", + "rationale": "Same event narrative — Hegseth + US Navy + Iran blockade. The 2026-04-24 brief had these adjacent; that adjacency should be preserved.", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "US-Iran War Live Updates: Another US Aircraft Carrier Reaches Gulf, Joins 2 Already Operating There", + "expected": "cluster", + "rationale": "Same news cycle — US military escalation against Iran. Both name US naval/military assets and Iran. The 2026-04-24 brief split them (positions 1 vs 7), driving the original user complaint.", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "36 serving military officers to be arraigned for coup plot against President Tinubu", + "title_b": "Behind Nigeria's murky coup plot — the money, the prayers and a Nollywood arrest", + "expected": "cluster", + "rationale": "Same event — Nigeria's coup plot. President Tinubu IS Nigeria. The 2026-04-24 brief split them across positions 4 and 6 with movie junk in between.", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "Navy Secretary John Phelan fired amid tensions with Pete Hegseth as Iran war rages", + "expected": "cluster", + "rationale": "Same actor (Hegseth) + same news cycle (Iran war / US firings). The 2026-04-25 morning brief still split these (positions 1 vs 4) — proves the threshold-too-tight defect persists.", + "source_brief": "2026-04-25-0802" + }, + { + "title_a": "Israel-Iran war LIVE: Iran's supreme leader Mojtaba Khamenei 'gravely wounded': NYT", + "title_b": "Iran war leaves seafarers stranded in the Gulf", + "expected": "cluster", + "rationale": "Both are Iran-war effect stories. Reasonable expectation that they appear adjacent in a daily brief about the same conflict.", + "source_brief": "2026-04-25-0802" + }, + { + "title_a": "Iran war leaves seafarers stranded in the Gulf", + "title_b": "UAE pupils seek university places after exams scrapped due to Iran war", + "expected": "cluster", + "rationale": "Both are Iran-war regional-effect stories from Gulf countries. The 2026-04-25 brief split them across positions 8 and 11.", + "source_brief": "2026-04-25-0802" + }, + { + "title_a": "ECB interest rate dilemma: Eurozone growth stalls as Iran war fuels inflation", + "title_b": "Iran war leaves seafarers stranded in the Gulf", + "expected": "cluster", + "rationale": "Both are Iran-war economic-effect stories. Different geographic angle (Europe vs Gulf) but same root narrative.", + "source_brief": "2026-04-25-0802" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "Average of 47 women and girls killed daily between 2023-2025 during Israel's war in Gaza, UN says", + "expected": "separate", + "rationale": "Different conflicts (US-Iran military vs Israel-Gaza humanitarian). Clustering would create a false adjacency that conflates two distinct news threads.", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "Armenia marks 111th anniversary of Armenian genocide in Yerevan", + "expected": "separate", + "rationale": "Different topics, different geographies, different news cycles. Should NEVER cluster.", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "Could the US military handle a monster invasion? Monarch: Legacy of Monsters begs the question", + "expected": "separate", + "rationale": "Anti-pair. The Apple TV+ show review should never cluster with real geopolitical news. (Separate classifier-hardening problem; included here as a hard-negative to keep the false-adjacency metric honest.)", + "source_brief": "2026-04-24-2001" + }, + { + "title_a": "Israel-Iran war LIVE: Iran's supreme leader Mojtaba Khamenei 'gravely wounded': NYT", + "title_b": "Syrian authorities arrest main suspect in 2013 Tadamon massacre", + "expected": "separate", + "rationale": "Both Middle East but different countries, different events, different news cycles (Iran-war today vs 12-year-old Syrian war crime).", + "source_brief": "2026-04-25-0802" + }, + { + "title_a": "Pentagon chief Hegseth says US blockade on Iran 'going global'", + "title_b": "How Israel targeted and killed Lebanese journalist Amal Khalil", + "expected": "separate", + "rationale": "Different conflicts, different perpetrators, different victims. Should NOT cluster even though both are Middle East / military.", + "source_brief": "2026-04-25-0802" + } + ] +} diff --git a/scripts/sweep-topic-thresholds.mjs b/scripts/sweep-topic-thresholds.mjs new file mode 100644 index 000000000..aced982ce --- /dev/null +++ b/scripts/sweep-topic-thresholds.mjs @@ -0,0 +1,537 @@ +#!/usr/bin/env node +// Offline threshold sweep for the brief topic-grouping pass. +// +// Reads the per-tick replay log captured by writeReplayLog (opt-in via +// DIGEST_DEDUP_REPLAY_LOG=1, key prefix `digest:replay-log:v1:`), +// reconstructs each tick's reps + cached embeddings, re-runs +// groupTopicsPostDedup at multiple cosine thresholds, and scores the +// resulting topic assignments against the labeled adjacency pairs in +// scripts/data/brief-adjacency-pairs.json. +// +// "Are we getting better" output: a markdown table — one row per +// candidate threshold — with pair_recall, false_adjacency, topic_count, +// avg_topic_size, and a composite quality_score. Pick the row with the +// highest quality_score; flip DIGEST_DEDUP_TOPIC_THRESHOLD on Railway +// to that value. +// +// Usage: +// node --import tsx/esm scripts/sweep-topic-thresholds.mjs # today, full:en:all +// node --import tsx/esm scripts/sweep-topic-thresholds.mjs --date 2026-04-24 # specific date +// node --import tsx/esm scripts/sweep-topic-thresholds.mjs --rule full:en:critical # specific rule +// node --import tsx/esm scripts/sweep-topic-thresholds.mjs --thresholds 0.30,0.35,0.40 # custom sweep +// node --import tsx/esm scripts/sweep-topic-thresholds.mjs --json > sweep-result.json # machine-readable + +import { readFileSync } from 'node:fs'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; +import { loadEnvFile, getRedisCredentials } from './_seed-utils.mjs'; +import { singleLinkCluster } from './lib/brief-dedup-embed.mjs'; +import { normalizeForEmbedding } from './lib/brief-embedding.mjs'; + +loadEnvFile(import.meta.url); + +// ── CLI args ─────────────────────────────────────────────────────────── + +// Resolve floor + cap + topN from production env, falling back to +// documented defaults. CLI flags override env. The replay log's +// tickConfig does not currently capture these (see PR #3390 follow-up +// to add scoreFloor/topN/maxStoriesPerUser to the writer's record); +// until then, env is the most-faithful source. +const SCORE_FLOOR_DEFAULT = 63; // matches production DIGEST_SCORE_MIN +const TOP_N_DEFAULT = 30; // matches production DIGEST_MAX_ITEMS +// Default 12 — matches production MAX_STORIES_PER_USER. PR #3389 kept +// the historical default after sweep evidence showed cap=16 hurts +// visible_quality at threshold 0.45. Override locally with +// DIGEST_MAX_STORIES_PER_USER env var or `--cap N` flag. +const MAX_STORIES_DEFAULT = 12; + +function envInt(name, fallback) { + const raw = process.env[name]; + if (raw == null || raw === '') return fallback; + const n = Number.parseInt(raw, 10); + return Number.isFinite(n) && n > 0 ? n : fallback; +} + +function parseArgs(argv) { + const out = { + date: new Date().toISOString().slice(0, 10), + rule: 'full:en:all', + thresholds: [0.30, 0.32, 0.35, 0.38, 0.40, 0.42, 0.45], + scoreFloor: envInt('DIGEST_SCORE_MIN', SCORE_FLOOR_DEFAULT), + topN: envInt('DIGEST_MAX_ITEMS', TOP_N_DEFAULT), + maxStoriesPerUser: envInt('DIGEST_MAX_STORIES_PER_USER', MAX_STORIES_DEFAULT), + json: false, + }; + for (let i = 2; i < argv.length; i++) { + const a = argv[i]; + if (a === '--date') out.date = argv[++i]; + else if (a === '--rule') out.rule = argv[++i]; + else if (a === '--thresholds') { + out.thresholds = argv[++i].split(',').map((x) => Number(x.trim())).filter(Number.isFinite); + } else if (a === '--score-floor') out.scoreFloor = Number(argv[++i]); + else if (a === '--top-n') out.topN = Number(argv[++i]); + else if (a === '--max-stories' || a === '--cap') out.maxStoriesPerUser = Number(argv[++i]); + else if (a === '--json') out.json = true; + else if (a === '--help' || a === '-h') { + console.log(readFileSync(fileURLToPath(import.meta.url), 'utf8').split('\n').slice(0, 23).join('\n')); + process.exit(0); + } + } + return out; +} + +// ── Redis helpers ─────────────────────────────────────────────────────── + +const REPLAY_KEY_PREFIX = 'digest:replay-log:v1'; + +async function redisLrangeAll(url, token, key) { + // Pull entire list. Page size 1000 to keep individual responses bounded. + const out = []; + const PAGE = 1000; + let start = 0; + while (true) { + const stop = start + PAGE - 1; + const res = await fetch(`${url}/lrange/${encodeURIComponent(key)}/${start}/${stop}`, { + headers: { Authorization: `Bearer ${token}` }, + }); + if (!res.ok) { + throw new Error(`LRANGE failed: HTTP ${res.status} ${await res.text()}`); + } + const body = await res.json(); + const items = Array.isArray(body?.result) ? body.result : []; + out.push(...items); + if (items.length < PAGE) break; + start += PAGE; + } + return out; +} + +async function redisMget(url, token, keys) { + // Upstash MGET via REST. Returns array same length as keys; null for missing. + if (keys.length === 0) return []; + const path = keys.map((k) => encodeURIComponent(k)).join('/'); + const res = await fetch(`${url}/mget/${path}`, { + headers: { Authorization: `Bearer ${token}` }, + }); + if (!res.ok) { + throw new Error(`MGET failed: HTTP ${res.status} ${await res.text()}`); + } + const body = await res.json(); + return Array.isArray(body?.result) ? body.result : new Array(keys.length).fill(null); +} + +// ── Replay record helpers ─────────────────────────────────────────────── + +function parseReplayRecords(rawList) { + const recs = []; + for (const raw of rawList) { + if (typeof raw !== 'string') continue; + try { + const r = JSON.parse(raw); + if (r && typeof r === 'object' && r.briefTickId) recs.push(r); + } catch { /* swallow malformed entries */ } + } + return recs; +} + +function groupByTick(records) { + const ticks = new Map(); + for (const r of records) { + if (!ticks.has(r.briefTickId)) ticks.set(r.briefTickId, []); + ticks.get(r.briefTickId).push(r); + } + return ticks; +} + +// ── Pair labels ───────────────────────────────────────────────────────── + +function loadLabeledPairs() { + const __dirname = dirname(fileURLToPath(import.meta.url)); + const path = resolve(__dirname, 'data', 'brief-adjacency-pairs.json'); + const raw = JSON.parse(readFileSync(path, 'utf8')); + return Array.isArray(raw?.pairs) ? raw.pairs : []; +} + +// Apply normalizeForEmbedding to each label so titles match what was +// actually embedded in the replay log. +function indexLabelsByNormalizedTitle(pairs) { + const out = []; + for (const p of pairs) { + if (!p.title_a || !p.title_b) continue; + out.push({ + a: normalizeForEmbedding(p.title_a), + b: normalizeForEmbedding(p.title_b), + expected: p.expected, + rationale: p.rationale, + source_brief: p.source_brief, + }); + } + return out; +} + +// ── Threshold scoring ─────────────────────────────────────────────────── + +// Mirror the production slice: groupTopicsPostDedup runs on the +// top-DIGEST_MAX_ITEMS reps by score, NOT the full deduped set. +// scripts/seed-digest-notifications.mjs:479 — `deduped.slice(0, 30)`. +const MIN_SURVIVING_REPS = 5; // skip ticks with fewer hydrated reps + +function scoreOneTick({ reps, embeddingByHash, labels, thresholds, scoreFloor, topN, maxStoriesPerUser, missingEmbedReporter }) { + // Apply production-equivalent floor + slice so the sweep reflects + // what topic-grouping actually sees in prod, not the 800-rep raw pool. + const floored = reps.filter((r) => Number(r.currentScore ?? 0) >= scoreFloor); + const slicedReplay = [...floored] + .sort((a, b) => Number(b.currentScore ?? 0) - Number(a.currentScore ?? 0)) + .slice(0, topN); + if (slicedReplay.length <= 1) { + return thresholds.map((t) => ({ threshold: t, topic_count: slicedReplay.length, sizes: [], pair_results: [], pair_results_visible: [] })); + } + + // Remap replay-record shape (storyHash, normalizedTitle, …) to the + // shape brief-dedup expects (hash, title, currentScore). Filter out + // reps whose embedding is missing from the cache (transient eviction + // or a rep written before the cache was populated). Skip the tick + // entirely if too few reps survive. + const remapped = slicedReplay.map((r) => ({ + hash: r.storyHash, + title: r.normalizedTitle, + currentScore: r.currentScore, + })); + const survivors = remapped.filter((r) => Array.isArray(embeddingByHash.get(r.hash))); + const dropped = remapped.length - survivors.length; + if (dropped > 0 && missingEmbedReporter) missingEmbedReporter(dropped); + if (survivors.length < MIN_SURVIVING_REPS) return null; + const sliced = survivors; + + const out = []; + for (const threshold of thresholds) { + // Run the same single-link cluster groupTopicsPostDedup uses + // internally. We compute the partition directly so the + // topic-membership labels are byte-identical to what production + // would produce at this threshold (no leader-only approximation). + const items = sliced.map((r) => ({ + title: r.title, + embedding: embeddingByHash.get(r.hash), + })); + const { clusters } = singleLinkCluster(items, { cosineThreshold: threshold, vetoFn: null }); + + // Map sliced index → topicId + const topicOfIdx = new Array(sliced.length).fill(-1); + clusters.forEach((members, tIdx) => { + for (const i of members) topicOfIdx[i] = tIdx; + }); + + // Title → topic membership for label scoring + const titleToTopic = new Map(); + for (let i = 0; i < sliced.length; i++) titleToTopic.set(sliced[i].title, topicOfIdx[i]); + + const topicCount = clusters.length; + const sizes = clusters.map((c) => c.length); + // singleLinkCluster IS the partition algorithm groupTopicsPostDedup + // uses internally (scripts/lib/brief-dedup.mjs:336 — clusterFn + // defaults to singleLinkCluster). No second pass needed; we get + // the same partition production would compute, faithfully. + + // Reproduce groupTopicsPostDedup's ordering so we can answer the + // cap-related question: which members survive the post-cluster + // top-N truncation? Order = topics by (size DESC, max-score DESC), + // members within a topic by (score DESC). Tiebreaks are + // deterministic by input order — close enough for evaluation. + const topicMaxScore = clusters.map((members) => + Math.max(...members.map((i) => Number(sliced[i].currentScore ?? 0))), + ); + const topicOrder = [...clusters.keys()].sort((a, b) => { + if (sizes[a] !== sizes[b]) return sizes[b] - sizes[a]; + return topicMaxScore[b] - topicMaxScore[a]; + }); + const orderedIdx = []; + for (const tIdx of topicOrder) { + const members = [...clusters[tIdx]].sort( + (a, b) => Number(sliced[b].currentScore ?? 0) - Number(sliced[a].currentScore ?? 0), + ); + orderedIdx.push(...members); + } + const visibleIdxSet = new Set(orderedIdx.slice(0, maxStoriesPerUser)); + // Title → sliced index, for visibility lookup + const titleToIdx = new Map(); + for (let i = 0; i < sliced.length; i++) titleToIdx.set(sliced[i].title, i); + + const pair_results = []; + const pair_results_visible = []; + for (const lab of labels) { + const tA = titleToTopic.get(lab.a); + const tB = titleToTopic.get(lab.b); + if (tA == null || tB == null) continue; // pair not present in this tick + const clustered = tA === tB; + pair_results.push({ expected: lab.expected, clustered }); + + // Visible-window evaluation: did BOTH labeled stories survive + // the post-cluster top-N truncation? This is what users actually + // see. Drives the cap-bump validation question (PR #3389): + // does bumping cap=12 → 16 cause more cluster-pairs to land + // visibly adjacent? + const iA = titleToIdx.get(lab.a); + const iB = titleToIdx.get(lab.b); + if (visibleIdxSet.has(iA) && visibleIdxSet.has(iB)) { + pair_results_visible.push({ expected: lab.expected, clustered }); + } + } + + out.push({ + threshold, + topic_count: topicCount, + sizes: [...sizes].sort((a, b) => b - a), + pair_results, + pair_results_visible, + visible_count: Math.min(orderedIdx.length, maxStoriesPerUser), + }); + } + return out; +} + + +// ── Aggregation across ticks ──────────────────────────────────────────── + +function aggregateByThreshold(perTickRows, thresholds) { + const summary = new Map(); + for (const t of thresholds) summary.set(t, { + threshold: t, + ticks: 0, + avg_topic_count: 0, + avg_max_topic_size: 0, + avg_visible_count: 0, + multi_member_topic_share: 0, + pair_recall_cluster: 0, // partition-only (whole tick) + false_adjacency: 0, // partition-only (whole tick) + pair_recall_visible: 0, // both members visible AND clustered + false_adjacency_visible: 0, // both members visible AND clustered (separate-labeled) + quality_score: 0, + visible_quality_score: 0, + samples: 0, + visible_samples: 0, + }); + for (const tickRows of perTickRows) { + if (!tickRows) continue; + for (const row of tickRows) { + const s = summary.get(row.threshold); + if (!s) continue; + s.ticks += 1; + s.avg_topic_count += row.topic_count; + s.avg_max_topic_size += row.sizes[0] ?? 0; + s.avg_visible_count += row.visible_count ?? 0; + const multiMember = row.sizes.filter((x) => x > 1).length; + s.multi_member_topic_share += row.topic_count > 0 ? multiMember / row.topic_count : 0; + for (const p of row.pair_results) { + if (p.expected === 'cluster') { + s.pair_recall_cluster += p.clustered ? 1 : 0; + s._cluster_total = (s._cluster_total ?? 0) + 1; + } else { + s.false_adjacency += p.clustered ? 1 : 0; + s._separate_total = (s._separate_total ?? 0) + 1; + } + s.samples += 1; + } + for (const p of (row.pair_results_visible ?? [])) { + if (p.expected === 'cluster') { + s.pair_recall_visible += p.clustered ? 1 : 0; + s._cluster_total_visible = (s._cluster_total_visible ?? 0) + 1; + } else { + s.false_adjacency_visible += p.clustered ? 1 : 0; + s._separate_total_visible = (s._separate_total_visible ?? 0) + 1; + } + s.visible_samples += 1; + } + } + } + for (const s of summary.values()) { + if (s.ticks === 0) continue; + s.avg_topic_count /= s.ticks; + s.avg_max_topic_size /= s.ticks; + s.avg_visible_count /= s.ticks; + s.multi_member_topic_share /= s.ticks; + s.pair_recall_cluster = (s._cluster_total ?? 0) > 0 ? s.pair_recall_cluster / s._cluster_total : 0; + s.false_adjacency = (s._separate_total ?? 0) > 0 ? s.false_adjacency / s._separate_total : 0; + s.pair_recall_visible = (s._cluster_total_visible ?? 0) > 0 ? s.pair_recall_visible / s._cluster_total_visible : 0; + s.false_adjacency_visible = (s._separate_total_visible ?? 0) > 0 ? s.false_adjacency_visible / s._separate_total_visible : 0; + // Composite: weight visible recall (what users actually see), + // penalise visible false adjacency, small bonus for multi-member + // share. The visible variant is the deployment metric — it answers + // "does this config produce a better brief?" rather than "does it + // produce a better partition?" + s.quality_score = ( + s.pair_recall_cluster * 0.6 + + (1 - s.false_adjacency) * 0.3 + + s.multi_member_topic_share * 0.1 + ); + s.visible_quality_score = ( + s.pair_recall_visible * 0.6 + + (1 - s.false_adjacency_visible) * 0.3 + + s.multi_member_topic_share * 0.1 + ); + delete s._cluster_total; + delete s._separate_total; + delete s._cluster_total_visible; + delete s._separate_total_visible; + } + return [...summary.values()].sort((a, b) => a.threshold - b.threshold); +} + +// ── Output formatters ─────────────────────────────────────────────────── + +function renderMarkdownTable(rows, ctx) { + const lines = []; + lines.push(`# Brief topic-threshold sweep — ${ctx.rule} on ${ctx.date}`); + lines.push(''); + lines.push(`Replay records: ${ctx.recordCount}, ticks: ${ctx.tickCount}, evaluable ticks: ${ctx.evaluableTicks}`); + lines.push(`Labeled pairs loaded: ${ctx.labelCount} (${ctx.clusterLabels} cluster, ${ctx.separateLabels} separate)`); + lines.push(`Production-equivalent slice: scoreFloor=${ctx.scoreFloor}, topN=${ctx.topN}, maxStoriesPerUser (cap)=${ctx.maxStoriesPerUser}`); + if (ctx.missingEmbedDrops > 0) { + lines.push(`Reps dropped due to missing cached embeddings: ${ctx.missingEmbedDrops} (across all ticks)`); + } + lines.push(''); + lines.push('Visible-window metrics measure what ends up in the user-visible top-N brief AFTER cap-truncation.'); + lines.push('Partition metrics measure cluster correctness ignoring the cap.'); + lines.push(''); + lines.push('| threshold | visible_quality | visible_recall | visible_false_adj | partition_quality | partition_recall | partition_false_adj | avg_topics | multi_share | visible_samples / partition_samples |'); + lines.push('|-----------|-----------------|----------------|-------------------|-------------------|------------------|---------------------|------------|-------------|-------------------------------------|'); + // Compute the GLOBAL best in a first pass so the ⭐ marker only + // tags one row. The previous one-pass approach starred every row + // that was the running best at the time it was rendered (Greptile + // P1 on PR #3390). + let best = null; + for (const r of rows) { + if (r.ticks === 0) continue; + if (best == null || r.visible_quality_score > best.visible_quality_score) best = r; + } + for (const r of rows) { + if (r.ticks === 0) continue; + const star = (r === best) ? ' ⭐' : ''; + lines.push( + `| ${r.threshold.toFixed(2)} ` + + `| ${r.visible_quality_score.toFixed(3)}${star} ` + + `| ${(r.pair_recall_visible * 100).toFixed(1)}% ` + + `| ${(r.false_adjacency_visible * 100).toFixed(1)}% ` + + `| ${r.quality_score.toFixed(3)} ` + + `| ${(r.pair_recall_cluster * 100).toFixed(1)}% ` + + `| ${(r.false_adjacency * 100).toFixed(1)}% ` + + `| ${r.avg_topic_count.toFixed(1)} ` + + `| ${(r.multi_member_topic_share * 100).toFixed(1)}% ` + + `| ${r.visible_samples} / ${r.samples} |`, + ); + } + if (best) { + lines.push(''); + lines.push(`**Recommended threshold: ${best.threshold.toFixed(2)}** (visible_quality=${best.visible_quality_score.toFixed(3)}, visible_recall=${(best.pair_recall_visible*100).toFixed(1)}%, visible_false_adj=${(best.false_adjacency_visible*100).toFixed(1)}%)`); + lines.push(''); + lines.push(`Apply via Railway env on the **scripts-cron-digest-notifications** service:`); + lines.push(` \`DIGEST_DEDUP_TOPIC_THRESHOLD=${best.threshold.toFixed(2)}\``); + lines.push(''); + lines.push('To compare cap values, re-run with `--cap 12` and `--cap 16`. The `visible_*` columns will diverge if cap-truncation is materially affecting topic adjacency.'); + } + return lines.join('\n'); +} + +// ── Main ──────────────────────────────────────────────────────────────── + +async function main() { + const args = parseArgs(process.argv); + const { url, token } = getRedisCredentials(); + const replayKey = `${REPLAY_KEY_PREFIX}:${args.rule}:${args.date}`; + + const rawList = await redisLrangeAll(url, token, replayKey); + const records = parseReplayRecords(rawList); + if (records.length === 0) { + console.error(`No replay records at ${replayKey}. Is DIGEST_DEDUP_REPLAY_LOG=1 set on Railway?`); + process.exit(2); + } + + const ticks = groupByTick(records); + + // For each tick: reps = records where isRep===true. Hydrate embeddings + // via MGET on embeddingCacheKey. + const allCacheKeys = new Set(); + for (const tickRecs of ticks.values()) { + for (const r of tickRecs) { + if (r.isRep && r.embeddingCacheKey) allCacheKeys.add(r.embeddingCacheKey); + } + } + const cacheKeyList = [...allCacheKeys]; + // Chunk MGET to keep URL length sane (Upstash REST has practical caps). + const CHUNK = 50; + const embeddingByCacheKey = new Map(); + for (let i = 0; i < cacheKeyList.length; i += CHUNK) { + const chunk = cacheKeyList.slice(i, i + CHUNK); + const vals = await redisMget(url, token, chunk); + for (let j = 0; j < chunk.length; j++) { + if (typeof vals[j] !== 'string') continue; + try { + const vec = JSON.parse(vals[j]); + if (Array.isArray(vec) && vec.length > 0) embeddingByCacheKey.set(chunk[j], vec); + } catch { /* skip malformed */ } + } + } + + const labels = indexLabelsByNormalizedTitle(loadLabeledPairs()); + const clusterLabels = labels.filter((l) => l.expected === 'cluster').length; + const separateLabels = labels.length - clusterLabels; + + // Score each tick at all thresholds. Reps with missing embeddings + // are filtered inside scoreOneTick (D fix); a tick is skipped only + // if too few reps survive (< MIN_SURVIVING_REPS). + const perTick = []; + let evaluable = 0; + let missingEmbedDrops = 0; + const reportMissing = (n) => { missingEmbedDrops += n; }; + for (const tickRecs of ticks.values()) { + const reps = tickRecs.filter((r) => r.isRep); + if (reps.length === 0) { perTick.push(null); continue; } + const embeddingByHash = new Map(); + for (const r of reps) { + const vec = embeddingByCacheKey.get(r.embeddingCacheKey); + if (Array.isArray(vec)) embeddingByHash.set(r.storyHash, vec); + } + const tickRows = scoreOneTick({ + reps, + embeddingByHash, + labels, + thresholds: args.thresholds, + scoreFloor: args.scoreFloor, + topN: args.topN, + maxStoriesPerUser: args.maxStoriesPerUser, + missingEmbedReporter: reportMissing, + }); + if (tickRows) { + perTick.push(tickRows); + evaluable += 1; + } else { + perTick.push(null); + } + } + + const rows = aggregateByThreshold(perTick, args.thresholds); + const ctx = { + rule: args.rule, + date: args.date, + recordCount: records.length, + tickCount: ticks.size, + evaluableTicks: evaluable, + labelCount: labels.length, + clusterLabels, + separateLabels, + scoreFloor: args.scoreFloor, + topN: args.topN, + maxStoriesPerUser: args.maxStoriesPerUser, + missingEmbedDrops, + }; + + if (args.json) { + console.log(JSON.stringify({ ctx, rows }, null, 2)); + } else { + console.log(renderMarkdownTable(rows, ctx)); + } +} + +main().catch((err) => { + console.error(`sweep-topic-thresholds: ${err?.stack ?? err?.message ?? String(err)}`); + process.exit(1); +});