mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
Move the heavy AI insights pipeline (clustering, scoring, LLM brief) from client-side (15-40s per user) to a 5-min Railway cron job. The frontend reads pre-computed insights instantly via bootstrap hydration, with graceful fallback to the existing client-side pipeline. - Add _clustering.mjs: Jaccard clustering + importance scoring (pure JS) - Add seed-insights.mjs: Railway cron reads digest, clusters, calls Groq/OpenRouter for brief, writes to Redis with LKG preservation - Register insights key in bootstrap.js FAST_KEYS tier - Add insights-loader.ts: module-level cached bootstrap reader - Modify InsightsPanel.ts: server-first path (2-step progress) with client fallback (4-step, unchanged behavior) - Add unit tests for clustering (12) and insights-loader (7)
198 lines
6.1 KiB
JavaScript
198 lines
6.1 KiB
JavaScript
#!/usr/bin/env node
|
|
|
|
const SIMILARITY_THRESHOLD = 0.5;
|
|
|
|
const STOP_WORDS = new Set([
|
|
'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
|
|
'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'been',
|
|
'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
|
|
'could', 'should', 'may', 'might', 'must', 'shall', 'can', 'need',
|
|
'it', 'its', 'this', 'that', 'these', 'those', 'i', 'you', 'he',
|
|
'she', 'we', 'they', 'what', 'which', 'who', 'whom', 'how', 'when',
|
|
'where', 'why', 'all', 'each', 'every', 'both', 'few', 'more', 'most',
|
|
'other', 'some', 'such', 'no', 'not', 'only', 'same', 'so', 'than',
|
|
'too', 'very', 'just', 'also', 'now', 'new', 'says', 'said', 'after',
|
|
]);
|
|
|
|
const MILITARY_KEYWORDS = [
|
|
'war', 'armada', 'invasion', 'airstrike', 'strike', 'missile', 'troops',
|
|
'deployed', 'offensive', 'artillery', 'bomb', 'combat', 'fleet', 'warship',
|
|
'carrier', 'navy', 'airforce', 'deployment', 'mobilization', 'attack',
|
|
];
|
|
|
|
const VIOLENCE_KEYWORDS = [
|
|
'killed', 'dead', 'death', 'shot', 'blood', 'massacre', 'slaughter',
|
|
'fatalities', 'casualties', 'wounded', 'injured', 'murdered', 'execution',
|
|
'crackdown', 'violent', 'clashes', 'gunfire', 'shooting',
|
|
];
|
|
|
|
const UNREST_KEYWORDS = [
|
|
'protest', 'protests', 'uprising', 'revolt', 'revolution', 'riot', 'riots',
|
|
'demonstration', 'unrest', 'dissent', 'rebellion', 'insurgent', 'overthrow',
|
|
'coup', 'martial law', 'curfew', 'shutdown', 'blackout',
|
|
];
|
|
|
|
const FLASHPOINT_KEYWORDS = [
|
|
'iran', 'tehran', 'russia', 'moscow', 'china', 'beijing', 'taiwan', 'ukraine', 'kyiv',
|
|
'north korea', 'pyongyang', 'israel', 'gaza', 'west bank', 'syria', 'damascus',
|
|
'yemen', 'hezbollah', 'hamas', 'kremlin', 'pentagon', 'nato', 'wagner',
|
|
];
|
|
|
|
const CRISIS_KEYWORDS = [
|
|
'crisis', 'emergency', 'catastrophe', 'disaster', 'collapse', 'humanitarian',
|
|
'sanctions', 'ultimatum', 'threat', 'retaliation', 'escalation', 'tensions',
|
|
'breaking', 'urgent', 'developing', 'exclusive',
|
|
];
|
|
|
|
const DEMOTE_KEYWORDS = [
|
|
'ceo', 'earnings', 'stock', 'startup', 'data center', 'datacenter', 'revenue',
|
|
'quarterly', 'profit', 'investor', 'ipo', 'funding', 'valuation',
|
|
];
|
|
|
|
function tokenize(text) {
|
|
const words = text
|
|
.toLowerCase()
|
|
.replace(/[^a-z0-9\s]/g, ' ')
|
|
.split(/\s+/)
|
|
.filter(w => w.length > 2 && !STOP_WORDS.has(w));
|
|
return new Set(words);
|
|
}
|
|
|
|
function jaccardSimilarity(a, b) {
|
|
if (a.size === 0 && b.size === 0) return 0;
|
|
let intersection = 0;
|
|
for (const x of a) {
|
|
if (b.has(x)) intersection++;
|
|
}
|
|
const union = a.size + b.size - intersection;
|
|
return intersection / union;
|
|
}
|
|
|
|
export function clusterItems(items) {
|
|
if (items.length === 0) return [];
|
|
|
|
const tokenList = items.map(item => tokenize(item.title || ''));
|
|
|
|
const invertedIndex = new Map();
|
|
for (let i = 0; i < tokenList.length; i++) {
|
|
for (const token of tokenList[i]) {
|
|
const bucket = invertedIndex.get(token);
|
|
if (bucket) bucket.push(i);
|
|
else invertedIndex.set(token, [i]);
|
|
}
|
|
}
|
|
|
|
const clusters = [];
|
|
const assigned = new Set();
|
|
|
|
for (let i = 0; i < items.length; i++) {
|
|
if (assigned.has(i)) continue;
|
|
|
|
const cluster = [i];
|
|
assigned.add(i);
|
|
const tokensI = tokenList[i];
|
|
|
|
const candidates = new Set();
|
|
for (const token of tokensI) {
|
|
const bucket = invertedIndex.get(token);
|
|
if (!bucket) continue;
|
|
for (const idx of bucket) {
|
|
if (idx > i) candidates.add(idx);
|
|
}
|
|
}
|
|
|
|
for (const j of Array.from(candidates).sort((a, b) => a - b)) {
|
|
if (assigned.has(j)) continue;
|
|
if (jaccardSimilarity(tokensI, tokenList[j]) >= SIMILARITY_THRESHOLD) {
|
|
cluster.push(j);
|
|
assigned.add(j);
|
|
}
|
|
}
|
|
|
|
clusters.push(cluster.map(idx => items[idx]));
|
|
}
|
|
|
|
return clusters.map(group => {
|
|
const sorted = [...group].sort((a, b) => {
|
|
const tierDiff = (a.tier ?? 99) - (b.tier ?? 99);
|
|
if (tierDiff !== 0) return tierDiff;
|
|
return new Date(b.pubDate).getTime() - new Date(a.pubDate).getTime();
|
|
});
|
|
|
|
const primary = sorted[0];
|
|
return {
|
|
primaryTitle: primary.title,
|
|
primarySource: primary.source,
|
|
primaryLink: primary.link,
|
|
sourceCount: group.length,
|
|
isAlert: group.some(i => i.isAlert),
|
|
};
|
|
});
|
|
}
|
|
|
|
function countMatches(text, keywords) {
|
|
return keywords.filter(kw => text.includes(kw)).length;
|
|
}
|
|
|
|
export function scoreImportance(cluster) {
|
|
let score = 0;
|
|
const titleLower = (cluster.primaryTitle || '').toLowerCase();
|
|
|
|
score += (cluster.sourceCount || 1) * 10;
|
|
|
|
const violenceN = countMatches(titleLower, VIOLENCE_KEYWORDS);
|
|
if (violenceN > 0) score += 100 + violenceN * 25;
|
|
|
|
const militaryN = countMatches(titleLower, MILITARY_KEYWORDS);
|
|
if (militaryN > 0) score += 80 + militaryN * 20;
|
|
|
|
const unrestN = countMatches(titleLower, UNREST_KEYWORDS);
|
|
if (unrestN > 0) score += 70 + unrestN * 18;
|
|
|
|
const flashpointN = countMatches(titleLower, FLASHPOINT_KEYWORDS);
|
|
if (flashpointN > 0) score += 60 + flashpointN * 15;
|
|
|
|
if ((violenceN > 0 || unrestN > 0) && flashpointN > 0) {
|
|
score *= 1.5;
|
|
}
|
|
|
|
const crisisN = countMatches(titleLower, CRISIS_KEYWORDS);
|
|
if (crisisN > 0) score += 30 + crisisN * 10;
|
|
|
|
const demoteN = countMatches(titleLower, DEMOTE_KEYWORDS);
|
|
if (demoteN > 0) score *= 0.3;
|
|
|
|
if (cluster.isAlert) score += 50;
|
|
|
|
return score;
|
|
}
|
|
|
|
// Note: velocity filter omitted (vs frontend selectTopStories) because digest
|
|
// items lack velocity data. Phase B may add velocity when RPC provides it.
|
|
export function selectTopStories(clusters, maxCount = 8) {
|
|
const scored = clusters
|
|
.map(c => ({ cluster: c, score: scoreImportance(c) }))
|
|
.filter(({ cluster: c, score }) =>
|
|
(c.sourceCount || 1) >= 2 ||
|
|
c.isAlert ||
|
|
score > 100
|
|
)
|
|
.sort((a, b) => b.score - a.score);
|
|
|
|
const selected = [];
|
|
const sourceCount = new Map();
|
|
const MAX_PER_SOURCE = 3;
|
|
|
|
for (const { cluster, score } of scored) {
|
|
const source = cluster.primarySource;
|
|
const count = sourceCount.get(source) || 0;
|
|
if (count < MAX_PER_SOURCE) {
|
|
selected.push({ ...cluster, importanceScore: score });
|
|
sourceCount.set(source, count + 1);
|
|
}
|
|
if (selected.length >= maxCount) break;
|
|
}
|
|
|
|
return selected;
|
|
}
|