Files
worldmonitor/scripts/seed-insights.mjs
Elie Habib 80b8071356 feat: server-side AI insights via Railway cron + bootstrap hydration (#1003)
Move the heavy AI insights pipeline (clustering, scoring, LLM brief)
from client-side (15-40s per user) to a 5-min Railway cron job. The
frontend reads pre-computed insights instantly via bootstrap hydration,
with graceful fallback to the existing client-side pipeline.

- Add _clustering.mjs: Jaccard clustering + importance scoring (pure JS)
- Add seed-insights.mjs: Railway cron reads digest, clusters, calls
  Groq/OpenRouter for brief, writes to Redis with LKG preservation
- Register insights key in bootstrap.js FAST_KEYS tier
- Add insights-loader.ts: module-level cached bootstrap reader
- Modify InsightsPanel.ts: server-first path (2-step progress) with
  client fallback (4-step, unchanged behavior)
- Add unit tests for clustering (12) and insights-loader (7)
2026-03-04 20:42:51 +04:00

297 lines
11 KiB
JavaScript

#!/usr/bin/env node
import { loadEnvFile, CHROME_UA, getRedisCredentials, runSeed } from './_seed-utils.mjs';
import { clusterItems, selectTopStories } from './_clustering.mjs';
loadEnvFile(import.meta.url);
const CANONICAL_KEY = 'news:insights:v1';
const DIGEST_KEY = 'news:digest:v1:full:en';
const CACHE_TTL = 600; // 10 min (2x the 5-min cron interval)
const MAX_HEADLINES = 10;
const MAX_HEADLINE_LEN = 500;
const GROQ_MODEL = 'llama-3.1-8b-instant';
const TASK_NARRATION = /^(we need to|i need to|let me|i'll |i should|i will |the task is|the instructions|according to the rules|so we need to|okay[,.]\s*(i'll|let me|so|we need|the task|i should|i will)|sure[,.]\s*(i'll|let me|so|we need|the task|i should|i will|here)|first[, ]+(i|we|let)|to summarize (the headlines|the task|this)|my task (is|was|:)|step \d)/i;
const PROMPT_ECHO = /^(summarize the top story|summarize the key|rules:|here are the rules|the top story is likely)/i;
function stripReasoningPreamble(text) {
const trimmed = text.trim();
if (TASK_NARRATION.test(trimmed) || PROMPT_ECHO.test(trimmed)) {
const lines = trimmed.split('\n').filter(l => l.trim());
const clean = lines.filter(l => !TASK_NARRATION.test(l.trim()) && !PROMPT_ECHO.test(l.trim()));
return clean.join('\n').trim() || trimmed;
}
return trimmed;
}
function sanitizeTitle(title) {
if (typeof title !== 'string') return '';
return title
.replace(/<[^>]*>/g, '')
.replace(/[\x00-\x1f\x7f]/g, '')
.slice(0, MAX_HEADLINE_LEN)
.trim();
}
async function readDigestFromRedis() {
const { url, token } = getRedisCredentials();
const resp = await fetch(`${url}/get/${encodeURIComponent(DIGEST_KEY)}`, {
headers: { Authorization: `Bearer ${token}` },
signal: AbortSignal.timeout(5_000),
});
if (!resp.ok) return null;
const data = await resp.json();
return data.result ? JSON.parse(data.result) : null;
}
async function readExistingInsights() {
const { url, token } = getRedisCredentials();
const resp = await fetch(`${url}/get/${encodeURIComponent(CANONICAL_KEY)}`, {
headers: { Authorization: `Bearer ${token}` },
signal: AbortSignal.timeout(5_000),
});
if (!resp.ok) return null;
const data = await resp.json();
return data.result ? JSON.parse(data.result) : null;
}
async function callGroq(headlines) {
const apiKey = process.env.GROQ_API_KEY;
if (!apiKey) return null;
const headlineText = headlines.map((h, i) => `${i + 1}. ${h}`).join('\n');
const dateContext = `Current date: ${new Date().toISOString().split('T')[0]}. Provide geopolitical context appropriate for the current date.`;
const systemPrompt = `${dateContext}
Summarize the single most important headline in 2 concise sentences MAX (under 60 words total).
Rules:
- Each numbered headline below is a SEPARATE, UNRELATED story
- Pick the ONE most significant headline and summarize ONLY that story
- NEVER combine or merge people, places, or facts from different headlines into one sentence
- Lead with WHAT happened and WHERE - be specific
- NEVER start with "Breaking news", "Good evening", "Tonight", or TV-style openings
- Start directly with the subject of the chosen headline
- No bullet points, no meta-commentary, no elaboration beyond the core facts`;
const userPrompt = `Each headline below is a separate story. Pick the most important ONE and summarize only that story:\n${headlineText}`;
try {
const resp = await fetch('https://api.groq.com/openai/v1/chat/completions', {
method: 'POST',
headers: {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json',
'User-Agent': CHROME_UA,
},
body: JSON.stringify({
model: GROQ_MODEL,
messages: [
{ role: 'system', content: systemPrompt },
{ role: 'user', content: userPrompt },
],
max_tokens: 300,
temperature: 0.3,
}),
signal: AbortSignal.timeout(15_000),
});
if (!resp.ok) {
console.warn(` Groq API error: ${resp.status}`);
return null;
}
const json = await resp.json();
const rawText = json.choices?.[0]?.message?.content?.trim();
if (!rawText) return null;
const text = stripReasoningPreamble(rawText);
return { text, model: json.model || GROQ_MODEL, provider: 'groq' };
} catch (err) {
console.warn(` Groq call failed: ${err.message}`);
return null;
}
}
async function callOpenRouter(headlines) {
const apiKey = process.env.OPENROUTER_API_KEY;
if (!apiKey) return null;
const headlineText = headlines.map((h, i) => `${i + 1}. ${h}`).join('\n');
const dateContext = `Current date: ${new Date().toISOString().split('T')[0]}.`;
try {
const resp = await fetch('https://openrouter.ai/api/v1/chat/completions', {
method: 'POST',
headers: {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json',
'HTTP-Referer': 'https://worldmonitor.app',
'X-Title': 'WorldMonitor',
'User-Agent': CHROME_UA,
},
body: JSON.stringify({
model: 'openrouter/free',
messages: [
{ role: 'system', content: `${dateContext} Summarize the single most important headline in 2 concise sentences MAX (under 60 words). Each headline is a SEPARATE story. Pick ONE. NEVER combine facts from different headlines. Lead with WHAT happened and WHERE.` },
{ role: 'user', content: `Pick the most important story:\n${headlineText}` },
],
max_tokens: 300,
temperature: 0.3,
}),
signal: AbortSignal.timeout(20_000),
});
if (!resp.ok) {
console.warn(` OpenRouter API error: ${resp.status}`);
return null;
}
const json = await resp.json();
const rawText = json.choices?.[0]?.message?.content?.trim();
if (!rawText) return null;
const text = stripReasoningPreamble(rawText);
return { text, model: json.model || 'openrouter/free', provider: 'openrouter' };
} catch (err) {
console.warn(` OpenRouter call failed: ${err.message}`);
return null;
}
}
function categorizeStory(title) {
const lower = (title || '').toLowerCase();
const categories = [
{ keywords: ['war', 'attack', 'missile', 'troops', 'airstrike', 'combat', 'military'], cat: 'conflict', threat: 'critical' },
{ keywords: ['killed', 'dead', 'casualties', 'massacre', 'shooting'], cat: 'violence', threat: 'high' },
{ keywords: ['protest', 'uprising', 'riot', 'unrest', 'coup'], cat: 'unrest', threat: 'high' },
{ keywords: ['sanctions', 'tensions', 'escalation', 'threat'], cat: 'geopolitical', threat: 'elevated' },
{ keywords: ['crisis', 'emergency', 'disaster', 'collapse'], cat: 'crisis', threat: 'high' },
{ keywords: ['earthquake', 'flood', 'hurricane', 'wildfire', 'tsunami'], cat: 'natural_disaster', threat: 'elevated' },
{ keywords: ['election', 'vote', 'parliament', 'legislation'], cat: 'political', threat: 'moderate' },
{ keywords: ['market', 'economy', 'trade', 'tariff', 'inflation'], cat: 'economic', threat: 'moderate' },
];
for (const { keywords, cat, threat } of categories) {
if (keywords.some(kw => lower.includes(kw))) {
return { category: cat, threatLevel: threat };
}
}
return { category: 'general', threatLevel: 'moderate' };
}
async function fetchInsights() {
const digest = await readDigestFromRedis();
if (!digest) throw new Error('No news digest found in Redis');
const items = Array.isArray(digest) ? digest :
(digest.items || digest.articles || digest.headlines || []);
if (items.length === 0) {
const keys = typeof digest === 'object' && digest !== null ? Object.keys(digest).join(', ') : typeof digest;
throw new Error(`Digest has no items (shape: ${keys})`);
}
console.log(` Digest items: ${items.length}`);
const normalizedItems = items.map(item => ({
title: sanitizeTitle(item.title || item.headline || ''),
source: item.source || item.feed || '',
link: item.link || item.url || '',
pubDate: item.pubDate || item.publishedAt || item.date || new Date().toISOString(),
isAlert: item.isAlert || false,
tier: item.tier,
})).filter(item => item.title.length > 10);
const clusters = clusterItems(normalizedItems);
console.log(` Clusters: ${clusters.length}`);
const topStories = selectTopStories(clusters, 8);
console.log(` Top stories: ${topStories.length}`);
if (topStories.length === 0) throw new Error('No top stories after scoring');
const headlines = topStories
.slice(0, MAX_HEADLINES)
.map(s => sanitizeTitle(s.primaryTitle));
let worldBrief = '';
let briefProvider = '';
let briefModel = '';
let status = 'ok';
const groqResult = await callGroq(headlines);
if (groqResult) {
worldBrief = groqResult.text;
briefProvider = groqResult.provider;
briefModel = groqResult.model;
console.log(` Brief generated via ${briefProvider} (${briefModel})`);
} else {
const orResult = await callOpenRouter(headlines);
if (orResult) {
worldBrief = orResult.text;
briefProvider = orResult.provider;
briefModel = orResult.model;
console.log(` Brief generated via ${briefProvider} (${briefModel})`);
} else {
status = 'degraded';
console.warn(' No LLM available — publishing degraded (stories without brief)');
}
}
const multiSourceCount = clusters.filter(c => c.sourceCount >= 2).length;
const fastMovingCount = 0; // velocity not available in digest items
const enrichedStories = topStories.map(story => {
const { category, threatLevel } = categorizeStory(story.primaryTitle);
return {
primaryTitle: story.primaryTitle,
primarySource: story.primarySource,
primaryLink: story.primaryLink,
sourceCount: story.sourceCount,
importanceScore: story.importanceScore,
velocity: { level: 'normal', sourcesPerHour: 0 },
isAlert: story.isAlert,
category,
threatLevel,
};
});
const payload = {
worldBrief,
briefProvider,
briefModel,
status,
topStories: enrichedStories,
generatedAt: new Date().toISOString(),
clusterCount: clusters.length,
multiSourceCount,
fastMovingCount,
};
// LKG preservation: don't overwrite "ok" with "degraded"
if (status === 'degraded') {
const existing = await readExistingInsights();
if (existing?.status === 'ok') {
console.log(' LKG preservation: existing payload is "ok", skipping degraded overwrite');
return existing;
}
}
return payload;
}
function validate(data) {
return Array.isArray(data?.topStories) && data.topStories.length >= 1;
}
runSeed('news', 'insights', CANONICAL_KEY, fetchInsights, {
validateFn: validate,
ttlSeconds: CACHE_TTL,
sourceVersion: 'digest-clustering-v1',
}).catch((err) => {
console.error('FATAL:', err.message || err);
process.exit(1);
});