diff --git a/scripts/regional-snapshot/narrative.mjs b/scripts/regional-snapshot/narrative.mjs
new file mode 100644
index 000000000..a212c2ab0
--- /dev/null
+++ b/scripts/regional-snapshot/narrative.mjs
@@ -0,0 +1,409 @@
+// @ts-check
+// Regional Intelligence narrative generator. Evidence-grounded LLM synthesis
+// over a deterministic RegionalSnapshot. One call per region per 6h cycle.
+//
+// Phase 1 PR2 — fills in the `narrative` field that Phase 0 left as empty
+// stubs, and populates SnapshotMeta.narrative_provider/narrative_model.
+//
+// Design notes:
+// - Single structured-JSON call per region (cheaper + better coherence
+//   than 6 per-section calls). Parsed into 6 sections + watch_items[].
+// - Skips the 'global' region entirely (too broad to be useful).
+// - Ship-empty on any LLM failure: the snapshot is still valuable without
+//   the narrative, and the diff engine surfaces state changes regardless.
+// - Evidence-grounded: each section's evidence_ids MUST be a subset of
+//   the evidence IDs already computed by collectEvidence(). Unknown IDs
+//   are silently filtered so a hallucinated ID never leaks through.
+// - Provider chain mirrors seed-insights.mjs / seed-forecasts.mjs:
+//   Groq → OpenRouter (Gemini Flash). Ollama skipped: the narrative call
+//   runs on Railway which has no local model.
+// - `callLlm` is dependency-injected so unit tests can exercise the full
+//   prompt + parser without network.
+
+import { extractFirstJsonObject, cleanJsonText } from '../_llm-json.mjs';
+
+const CHROME_UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
+
+const NARRATIVE_MAX_TOKENS = 900;
+const NARRATIVE_TEMPERATURE = 0.3;
+const MAX_ACTORS_IN_PROMPT = 5;
+const MAX_EVIDENCE_IN_PROMPT = 15;
+const MAX_TRANSMISSIONS_IN_PROMPT = 5;
+const MAX_WATCH_ITEMS = 3;
+
+/**
+ * Provider chain. Order matters: first provider with a configured env var wins.
+ */ +const DEFAULT_PROVIDERS = [ + { + name: 'groq', + envKey: 'GROQ_API_KEY', + apiUrl: 'https://api.groq.com/openai/v1/chat/completions', + model: 'llama-3.3-70b-versatile', + timeout: 20_000, + headers: (key) => ({ + Authorization: `Bearer ${key}`, + 'Content-Type': 'application/json', + 'User-Agent': CHROME_UA, + }), + }, + { + name: 'openrouter', + envKey: 'OPENROUTER_API_KEY', + apiUrl: 'https://openrouter.ai/api/v1/chat/completions', + model: 'google/gemini-2.5-flash', + timeout: 30_000, + headers: (key) => ({ + Authorization: `Bearer ${key}`, + 'Content-Type': 'application/json', + 'HTTP-Referer': 'https://worldmonitor.app', + 'X-Title': 'World Monitor', + 'User-Agent': CHROME_UA, + }), + }, +]; + +/** + * Canonical empty narrative. Matches RegionalNarrative shape. + * @returns {import('../../shared/regions.types.js').RegionalNarrative} + */ +export function emptyNarrative() { + return { + situation: { text: '', evidence_ids: [] }, + balance_assessment: { text: '', evidence_ids: [] }, + outlook_24h: { text: '', evidence_ids: [] }, + outlook_7d: { text: '', evidence_ids: [] }, + outlook_30d: { text: '', evidence_ids: [] }, + watch_items: [], + }; +} + +/** + * Return the evidence subset that is actually rendered into the prompt. + * Callers should use this same subset when deriving the valid-evidence-ID + * whitelist for parseNarrativeJson — otherwise the parser could accept + * citations to IDs the model never saw (P2 review finding on #2960). + * + * @param {import('../../shared/regions.types.js').EvidenceItem[]} evidence + * @returns {import('../../shared/regions.types.js').EvidenceItem[]} + */ +export function selectPromptEvidence(evidence) { + if (!Array.isArray(evidence)) return []; + return evidence.slice(0, MAX_EVIDENCE_IN_PROMPT); +} + +/** + * Build the evidence-grounded prompt. Pure — no network. + * + * `evidence` is rendered as-is. 
Callers that want the prompt-visible + * cap should call `selectPromptEvidence()` first so the same subset + * flows into both the prompt and the parser's evidence whitelist. + * + * @param {{id: string, label: string, forecastLabel: string}} region + * @param {import('../../shared/regions.types.js').RegionalSnapshot} snapshot + * @param {import('../../shared/regions.types.js').EvidenceItem[]} evidence + * @returns {{ systemPrompt: string, userPrompt: string }} + */ +export function buildNarrativePrompt(region, snapshot, evidence) { + const topActors = (snapshot.actors ?? []) + .slice(0, MAX_ACTORS_IN_PROMPT) + .map((a) => `${a.name} (${a.role}, leverage=${a.leverage_score.toFixed(2)})`) + .join(', '); + + const horizonSummary = (snapshot.scenario_sets ?? []) + .map((set) => { + const dominant = [...(set.lanes ?? [])].sort((a, b) => b.probability - a.probability)[0]; + return dominant + ? `${set.horizon}: ${dominant.name} (${Math.round(dominant.probability * 100)}%)` + : `${set.horizon}: (no lanes)`; + }) + .join(' | '); + + const topTransmissions = (snapshot.transmission_paths ?? []) + .slice(0, MAX_TRANSMISSIONS_IN_PROMPT) + .map((t) => `${t.mechanism} via ${t.corridor_id || t.start} (conf=${t.confidence.toFixed(2)})`) + .join('; '); + + const activeTriggers = (snapshot.triggers?.active ?? []) + .map((t) => t.id) + .join(', '); + + const evidenceLines = (evidence ?? []).map((e) => { + const summary = (e.summary ?? '').slice(0, 180); + const conf = typeof e.confidence === 'number' ? e.confidence.toFixed(2) : '0.00'; + return `- ${e.id} [${e.type}, conf=${conf}]: ${summary}`; + }); + const evidenceBlock = evidenceLines.length > 0 + ? 
evidenceLines.join('\n') + : '(no evidence available — reason over the balance vector alone)'; + + const balance = snapshot.balance; + const balanceLine = [ + `coercive=${balance.coercive_pressure.toFixed(2)}`, + `fragility=${balance.domestic_fragility.toFixed(2)}`, + `capital=${balance.capital_stress.toFixed(2)}`, + `energy_vuln=${balance.energy_vulnerability.toFixed(2)}`, + `alliance=${balance.alliance_cohesion.toFixed(2)}`, + `maritime=${balance.maritime_access.toFixed(2)}`, + `energy_lev=${balance.energy_leverage.toFixed(2)}`, + `net=${balance.net_balance.toFixed(2)}`, + ].join(' '); + + const systemPrompt = [ + `You are a senior geopolitical analyst producing a regional intelligence brief.`, + `Today is ${new Date().toISOString().split('T')[0]}.`, + ``, + `HARD RULES:`, + `- Output ONLY a single JSON object matching the schema below. No prose, no markdown, no code fences.`, + `- Each text field: 1–2 concise sentences, under 280 characters, no bullet points.`, + `- Every evidence_ids entry MUST be one of the IDs listed in the EVIDENCE block. Never invent IDs.`, + `- Ground claims in the evidence and the balance vector. Do not speculate beyond them.`, + `- Use present tense for situation/balance_assessment. Use hedged language for outlooks.`, + `- Neutral, analytical tone. No dramatization, no policy prescriptions.`, + ``, + `SCHEMA:`, + `{`, + ` "situation": { "text": "...", "evidence_ids": ["..."] },`, + ` "balance_assessment": { "text": "...", "evidence_ids": ["..."] },`, + ` "outlook_24h": { "text": "...", "evidence_ids": ["..."] },`, + ` "outlook_7d": { "text": "...", "evidence_ids": ["..."] },`, + ` "outlook_30d": { "text": "...", "evidence_ids": ["..."] },`, + ` "watch_items": [ { "text": "...", "evidence_ids": ["..."] } ]`, + `}`, + ``, + `watch_items: up to ${MAX_WATCH_ITEMS} specific indicators the analyst should monitor.`, + ].join('\n'); + + const userPrompt = [ + `REGION: ${region.label} (${region.id})`, + ``, + `REGIME: ${snapshot.regime?.label ?? 
'unknown'}`, + `BALANCE: ${balanceLine}`, + `TOP ACTORS: ${topActors || '(none)'}`, + `SCENARIO LEADS: ${horizonSummary || '(none)'}`, + `TOP TRANSMISSIONS: ${topTransmissions || '(none)'}`, + `ACTIVE TRIGGERS: ${activeTriggers || '(none)'}`, + ``, + `EVIDENCE:`, + evidenceBlock, + ``, + `Produce the JSON object now.`, + ].join('\n'); + + return { systemPrompt, userPrompt }; +} + +/** + * Validate + coerce a single NarrativeSection from raw parsed JSON. + * + * @param {unknown} raw + * @param {Set} validEvidenceIds + * @returns {import('../../shared/regions.types.js').NarrativeSection} + */ +function coerceSection(raw, validEvidenceIds) { + if (!raw || typeof raw !== 'object') return { text: '', evidence_ids: [] }; + const r = /** @type {Record} */ (raw); + const text = typeof r.text === 'string' ? r.text.trim() : ''; + const evidenceIds = Array.isArray(r.evidence_ids) + ? r.evidence_ids + .filter((id) => typeof id === 'string' && validEvidenceIds.has(id)) + : []; + return { text, evidence_ids: evidenceIds }; +} + +/** + * Parse the LLM JSON response into a RegionalNarrative. Filters any + * hallucinated evidence IDs against the set the caller provided. + * Returns { narrative, valid: false } on unparseable input so the caller + * can ship an empty narrative instead. + * + * @param {string} text + * @param {string[]} validEvidenceIds + * @returns {{ narrative: import('../../shared/regions.types.js').RegionalNarrative, valid: boolean }} + */ +export function parseNarrativeJson(text, validEvidenceIds) { + const validSet = new Set(validEvidenceIds); + if (!text || typeof text !== 'string') { + return { narrative: emptyNarrative(), valid: false }; + } + + let parsed; + try { + // Try direct parse first (LLM output is often wrapped in fences). 
+ parsed = JSON.parse(cleanJsonText(text)); + } catch { + const extracted = extractFirstJsonObject(text); + if (!extracted) return { narrative: emptyNarrative(), valid: false }; + try { + parsed = JSON.parse(extracted); + } catch { + return { narrative: emptyNarrative(), valid: false }; + } + } + + if (!parsed || typeof parsed !== 'object') { + return { narrative: emptyNarrative(), valid: false }; + } + + const p = /** @type {Record} */ (parsed); + const watch = Array.isArray(p.watch_items) + ? p.watch_items.slice(0, MAX_WATCH_ITEMS).map((w) => coerceSection(w, validSet)) + : []; + + const narrative = { + situation: coerceSection(p.situation, validSet), + balance_assessment: coerceSection(p.balance_assessment, validSet), + outlook_24h: coerceSection(p.outlook_24h, validSet), + outlook_7d: coerceSection(p.outlook_7d, validSet), + outlook_30d: coerceSection(p.outlook_30d, validSet), + watch_items: watch, + }; + + // Require at least one non-empty section to count as valid. Everything + // else being empty suggests a garbage LLM response we should discard. + const hasAnyText = + narrative.situation.text.length > 0 || + narrative.balance_assessment.text.length > 0 || + narrative.outlook_24h.text.length > 0 || + narrative.outlook_7d.text.length > 0 || + narrative.outlook_30d.text.length > 0 || + narrative.watch_items.some((w) => w.text.length > 0); + + return { narrative, valid: hasAnyText }; +} + +/** + * Real provider-chain caller. Walks DEFAULT_PROVIDERS in order, returning + * the first response that passes the optional `validate` predicate. + * Respects per-provider env gating and timeout. + * + * Callers should pass a `validate` that checks whether the text parses to + * a usable output. Without it, a single provider returning prose or + * truncated JSON would short-circuit the fallback chain — which was the + * P2 finding on #2960. 
+ * + * The returned `model` field reflects what the API actually ran + * (`json.model`), falling back to the provider's declared default. Some + * providers resolve aliases or route to a different concrete model, and + * persisted metadata should report the truth. + * + * @param {{ systemPrompt: string, userPrompt: string }} prompt + * @param {{ validate?: (text: string) => boolean }} [opts] + * @returns {Promise<{ text: string, provider: string, model: string } | null>} + */ +async function callLlmDefault({ systemPrompt, userPrompt }, opts = {}) { + const validate = opts.validate; + for (const provider of DEFAULT_PROVIDERS) { + const envVal = process.env[provider.envKey]; + if (!envVal) continue; + try { + const resp = await fetch(provider.apiUrl, { + method: 'POST', + headers: provider.headers(envVal), + body: JSON.stringify({ + model: provider.model, + messages: [ + { role: 'system', content: systemPrompt }, + { role: 'user', content: userPrompt }, + ], + max_tokens: NARRATIVE_MAX_TOKENS, + temperature: NARRATIVE_TEMPERATURE, + response_format: { type: 'json_object' }, + }), + signal: AbortSignal.timeout(provider.timeout), + }); + + if (!resp.ok) { + console.warn(`[narrative] ${provider.name}: HTTP ${resp.status}`); + continue; + } + + const json = /** @type {any} */ (await resp.json()); + const text = json?.choices?.[0]?.message?.content; + if (typeof text !== 'string' || text.trim().length === 0) { + console.warn(`[narrative] ${provider.name}: empty response`); + continue; + } + + const trimmed = text.trim(); + if (validate && !validate(trimmed)) { + console.warn(`[narrative] ${provider.name}: response failed validation, trying next provider`); + continue; + } + + // Prefer the model the provider actually ran over the requested alias. + const actualModel = typeof json?.model === 'string' && json.model.length > 0 + ? 
json.model + : provider.model; + + return { text: trimmed, provider: provider.name, model: actualModel }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[narrative] ${provider.name}: ${msg}`); + } + } + return null; +} + +/** + * Main entry: generate a narrative for one region. Ship-empty on any failure. + * + * Evidence is capped to `MAX_EVIDENCE_IN_PROMPT` BEFORE prompt construction, + * and the same cap bounds the parser's valid-evidence-ID whitelist, so + * citations can only reference items the model actually saw. + * + * The injected `callLlm` receives a `validate` callback that runs + * `parseNarrativeJson` on each provider's response; providers returning + * prose, truncated JSON, or all-empty objects fall through to the next + * provider instead of short-circuiting the whole chain. + * + * @param {{ id: string, label: string, forecastLabel: string }} region + * @param {import('../../shared/regions.types.js').RegionalSnapshot} snapshot + * @param {import('../../shared/regions.types.js').EvidenceItem[]} evidence + * @param {{ callLlm?: (prompt: { systemPrompt: string, userPrompt: string }, opts?: { validate?: (text: string) => boolean }) => Promise<{ text: string, provider: string, model: string } | null> }} [opts] + * @returns {Promise<{ + * narrative: import('../../shared/regions.types.js').RegionalNarrative, + * provider: string, + * model: string, + * }>} + */ +export async function generateRegionalNarrative(region, snapshot, evidence, opts = {}) { + // Global region is a catch-all; narratives aren't meaningful there. + if (region.id === 'global') { + return { narrative: emptyNarrative(), provider: '', model: '' }; + } + + const callLlm = opts.callLlm ?? callLlmDefault; + // Slice evidence once so the prompt and the parser's whitelist agree on + // exactly which IDs are citable. See selectPromptEvidence docstring. 
+ const promptEvidence = selectPromptEvidence(evidence); + const prompt = buildNarrativePrompt(region, snapshot, promptEvidence); + const validEvidenceIds = promptEvidence.map((e) => e.id); + + // Validator for the default provider-chain caller: a response is + // acceptable iff parseNarrativeJson returns valid=true against the + // prompt-visible evidence set. + const validate = (text) => parseNarrativeJson(text, validEvidenceIds).valid; + + let result; + try { + result = await callLlm(prompt, { validate }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.warn(`[narrative] ${region.id}: callLlm threw: ${msg}`); + return { narrative: emptyNarrative(), provider: '', model: '' }; + } + + if (!result) { + console.warn(`[narrative] ${region.id}: all providers failed, shipping empty narrative`); + return { narrative: emptyNarrative(), provider: '', model: '' }; + } + + const { narrative, valid } = parseNarrativeJson(result.text, validEvidenceIds); + if (!valid) { + console.warn(`[narrative] ${region.id}: JSON parse invalid, shipping empty narrative`); + return { narrative: emptyNarrative(), provider: '', model: '' }; + } + + return { narrative, provider: result.provider, model: result.model }; +} diff --git a/scripts/seed-regional-snapshots.mjs b/scripts/seed-regional-snapshots.mjs index 5818e7021..833c8a393 100644 --- a/scripts/seed-regional-snapshots.mjs +++ b/scripts/seed-regional-snapshots.mjs @@ -8,7 +8,10 @@ * structured trigger thresholds, builds normalized scenario sets, resolves * pre-built transmission templates, and persists to Redis with idempotency. * - * Phase 0: NO LLM narrative call. Phase 1+ adds the narrative layer. + * Phase 1 (PR2): LLM narrative layer added. One structured-JSON call per + * region via generateRegionalNarrative(), ship-empty on any failure. The + * 'global' region is skipped inside the generator. Provider + model flow + * through SnapshotMeta.narrative_provider / narrative_model. 
* * Architecture: docs/internal/pro-regional-intelligence-upgrade.md * Engineering: docs/internal/pro-regional-intelligence-appendix-engineering.md @@ -39,6 +42,7 @@ import { diffRegionalSnapshot, inferTriggerReason } from './regional-snapshot/di import { persistSnapshot, readLatestSnapshot } from './regional-snapshot/persist-snapshot.mjs'; import { ALL_INPUT_KEYS } from './regional-snapshot/freshness.mjs'; import { generateSnapshotId } from './regional-snapshot/_helpers.mjs'; +import { generateRegionalNarrative, emptyNarrative } from './regional-snapshot/narrative.mjs'; loadEnvFile(import.meta.url); @@ -86,10 +90,13 @@ async function readAllInputs() { * 7. transmissions * 8. mobility (empty in Phase 0) * 9. evidence - * 10. (skip narrative LLM call in Phase 0) - * 11. snapshot_id - * 12. read previous + diff - * 13. final_meta + * 10. snapshot_id + * 11. read previous + derive regime + * 12. build snapshot-for-prompt (no narrative yet) + * 13. LLM narrative call (ship-empty on failure; skipped for 'global') + * 14. splice narrative into tentative snapshot + * 15. diff → trigger_reason + * 16. final_meta with narrative_provider/narrative_model */ async function computeSnapshot(regionId, sources) { // Step 2: pre-meta @@ -122,30 +129,21 @@ async function computeSnapshot(regionId, sources) { // Step 9: evidence chain const evidence = collectEvidence(regionId, sources); - // Step 10: SKIPPED in Phase 0 (no narrative LLM call) - /** @type {import('../shared/regions.types.js').RegionalNarrative} */ - const narrative = { - situation: { text: '', evidence_ids: [] }, - balance_assessment: { text: '', evidence_ids: [] }, - outlook_24h: { text: '', evidence_ids: [] }, - outlook_7d: { text: '', evidence_ids: [] }, - outlook_30d: { text: '', evidence_ids: [] }, - watch_items: [], - }; - - // Step 11: snapshot_id + // Step 10: snapshot_id const snapshotId = generateSnapshotId(); - // Step 12: read previous, run diff + // Step 11: read previous + derive regime. 
Must happen before narrative + // generation because the prompt consumes the regime label. const previous = await readLatestSnapshot(regionId).catch(() => null); const previousLabel = previous?.regime?.label ?? ''; const regime = buildRegimeState(balance, previousLabel, ''); - // Build a tentative snapshot purely so the diff engine can compare against - // the previously-persisted snapshot. The tentative snapshot's meta is a - // throwaway placeholder; the real meta is built after the diff so trigger_reason - // can be derived from the diff result. - const tentativeSnapshot = { + // Step 12: snapshot-shaped input for the narrative prompt. The narrative + // generator reads regime/balance/actors/scenarios/triggers/evidence from + // this object and does NOT inspect `meta` or the placeholder narrative. + // Meta here is a throwaway — the real meta is built after diff so + // trigger_reason and narrative_* can flow in together. + const snapshotForPrompt = { region_id: regionId, generated_at: Date.now(), meta: buildFinalMeta(pre, { snapshot_id: snapshotId, trigger_reason: 'scheduled_6h' }), @@ -158,16 +156,33 @@ async function computeSnapshot(regionId, sources) { triggers, mobility, evidence, - narrative, + narrative: emptyNarrative(), }; + // Step 13: LLM narrative. Ship-empty on any failure — the snapshot remains + // valuable without the narrative, and the narrative generator itself + // never throws. 'global' is skipped inside the generator. + const region = REGIONS.find((r) => r.id === regionId); + const narrativeResult = region + ? await generateRegionalNarrative(region, snapshotForPrompt, evidence) + : { narrative: emptyNarrative(), provider: '', model: '' }; + + // Step 14: tentative snapshot with the real narrative spliced in. 
+ const tentativeSnapshot = { + ...snapshotForPrompt, + narrative: narrativeResult.narrative, + }; + + // Step 15: diff against previous for trigger_reason inference const diff = diffRegionalSnapshot(previous, tentativeSnapshot); const triggerReason = inferTriggerReason(diff); - // Step 13: final_meta with diff-derived trigger_reason + // Step 16: final_meta with diff-derived trigger_reason and narrative metadata const finalMeta = buildFinalMeta(pre, { snapshot_id: snapshotId, trigger_reason: triggerReason, + narrative_provider: narrativeResult.provider, + narrative_model: narrativeResult.model, }); // Return the snapshot WITHOUT the diff. The diff is a runtime artifact for diff --git a/tests/regional-snapshot-narrative.test.mjs b/tests/regional-snapshot-narrative.test.mjs new file mode 100644 index 000000000..469609f5e --- /dev/null +++ b/tests/regional-snapshot-narrative.test.mjs @@ -0,0 +1,558 @@ +// Tests for the Regional Intelligence narrative generator (Phase 1 PR2). +// Pure-function + injectable-LLM unit tests; no network. Run via: +// npm run test:data + +import { describe, it } from 'node:test'; +import assert from 'node:assert/strict'; + +import { + generateRegionalNarrative, + buildNarrativePrompt, + parseNarrativeJson, + emptyNarrative, + selectPromptEvidence, +} from '../scripts/regional-snapshot/narrative.mjs'; +import { REGIONS } from '../shared/geography.js'; + +// ──────────────────────────────────────────────────────────────────────────── +// Fixtures +// ──────────────────────────────────────────────────────────────────────────── + +const menaRegion = REGIONS.find((r) => r.id === 'mena'); +const globalRegion = REGIONS.find((r) => r.id === 'global'); + +/** Minimal RegionalSnapshot-shaped stub with the fields the prompt reads. 
*/ +function stubSnapshot(overrides = {}) { + return { + region_id: 'mena', + generated_at: 1_700_000_000_000, + meta: { + snapshot_id: 'test-id', + model_version: '0.1.0', + scoring_version: '1.0.0', + geography_version: '1.0.0', + snapshot_confidence: 0.9, + missing_inputs: [], + stale_inputs: [], + valid_until: 0, + trigger_reason: 'scheduled_6h', + narrative_provider: '', + narrative_model: '', + }, + regime: { label: 'coercive_stalemate', previous_label: 'calm', transitioned_at: 0, transition_driver: '' }, + balance: { + coercive_pressure: 0.72, + domestic_fragility: 0.55, + capital_stress: 0.40, + energy_vulnerability: 0.30, + alliance_cohesion: 0.60, + maritime_access: 0.70, + energy_leverage: 0.80, + net_balance: 0.03, + pressures: [], + buffers: [], + }, + actors: [ + { actor_id: 'IR', name: 'Iran', role: 'aggressor', leverage_domains: ['military'], leverage_score: 0.85, delta: 0.05, evidence_ids: [] }, + { actor_id: 'IL', name: 'Israel', role: 'stabilizer', leverage_domains: ['military'], leverage_score: 0.70, delta: 0.00, evidence_ids: [] }, + ], + leverage_edges: [], + scenario_sets: [ + { horizon: '24h', lanes: [ + { name: 'base', probability: 0.5, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'escalation', probability: 0.3, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'containment', probability: 0.15, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'fragmentation', probability: 0.05, trigger_ids: [], consequences: [], transmissions: [] }, + ] }, + { horizon: '7d', lanes: [ + { name: 'base', probability: 0.4, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'escalation', probability: 0.4, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'containment', probability: 0.15, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'fragmentation', probability: 0.05, trigger_ids: [], consequences: [], transmissions: [] }, + ] }, + { horizon: '30d', lanes: [ 
+ { name: 'base', probability: 0.35, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'escalation', probability: 0.45, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'containment', probability: 0.15, trigger_ids: [], consequences: [], transmissions: [] }, + { name: 'fragmentation', probability: 0.05, trigger_ids: [], consequences: [], transmissions: [] }, + ] }, + ], + transmission_paths: [ + { start: 'hormuz', mechanism: 'naval_posture', end: 'oil', severity: 'high', corridor_id: 'hormuz', confidence: 0.85, latency_hours: 12, impacted_asset_class: 'commodity', impacted_regions: ['mena'], magnitude_low: 0, magnitude_high: 0, magnitude_unit: 'pct', template_id: 't1', template_version: '1.0.0' }, + ], + triggers: { + active: [{ id: 'mena_coercive_high', description: '', threshold: {}, activated: true, activated_at: 0, scenario_lane: 'escalation', evidence_ids: [] }], + watching: [], + dormant: [], + }, + mobility: { airspace: [], flight_corridors: [], airports: [], reroute_intensity: 0, notam_closures: [] }, + evidence: [], + narrative: emptyNarrative(), + ...overrides, + }; +} + +const evidenceFixture = [ + { id: 'ev1', type: 'market_signal', source: 'cross-source', summary: 'Iran reports heightened naval posture near Hormuz', confidence: 0.85, observed_at: 1_700_000_000_000, theater: 'persian-gulf', corridor: 'hormuz' }, + { id: 'ev2', type: 'chokepoint_status', source: 'supply-chain', summary: 'Bab el-Mandeb threat level elevated', confidence: 0.9, observed_at: 1_700_000_000_000, theater: '', corridor: 'babelm' }, + { id: 'ev3', type: 'cii_spike', source: 'risk-scores', summary: 'IR CII jumped 12 points', confidence: 0.9, observed_at: 1_700_000_000_000, theater: '', corridor: '' }, +]; + +// ──────────────────────────────────────────────────────────────────────────── +// buildNarrativePrompt +// ──────────────────────────────────────────────────────────────────────────── + +describe('buildNarrativePrompt', () => { + it('returns 
system + user prompt strings', () => { + const { systemPrompt, userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), evidenceFixture); + assert.ok(typeof systemPrompt === 'string' && systemPrompt.length > 100); + assert.ok(typeof userPrompt === 'string' && userPrompt.length > 100); + }); + + it('includes balance axes in the user prompt', () => { + const { userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), evidenceFixture); + assert.match(userPrompt, /coercive=0\.72/); + assert.match(userPrompt, /net=0\.03/); + }); + + it('includes top actors and regime in the user prompt', () => { + const { userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), evidenceFixture); + assert.match(userPrompt, /Iran.*aggressor/); + assert.match(userPrompt, /coercive_stalemate/); + }); + + it('inlines evidence items with their ids', () => { + const { userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), evidenceFixture); + assert.match(userPrompt, /ev1/); + assert.match(userPrompt, /ev2/); + assert.match(userPrompt, /ev3/); + }); + + it('includes dominant scenario lane per horizon', () => { + const { userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), evidenceFixture); + assert.match(userPrompt, /24h:/); + assert.match(userPrompt, /7d:/); + assert.match(userPrompt, /30d:/); + }); + + it('handles a snapshot with no evidence gracefully', () => { + const { userPrompt } = buildNarrativePrompt(menaRegion, stubSnapshot(), []); + assert.match(userPrompt, /no evidence available/i); + }); + + it('tolerates missing optional snapshot fields without throwing', () => { + const bare = stubSnapshot({ actors: [], scenario_sets: [], transmission_paths: [], triggers: { active: [], watching: [], dormant: [] } }); + assert.doesNotThrow(() => buildNarrativePrompt(menaRegion, bare, [])); + }); +}); + +// ──────────────────────────────────────────────────────────────────────────── +// parseNarrativeJson +// 
──────────────────────────────────────────────────────────────────────────── + +describe('parseNarrativeJson', () => { + const validIds = ['ev1', 'ev2', 'ev3']; + + it('parses a clean JSON object into RegionalNarrative', () => { + const text = JSON.stringify({ + situation: { text: 'Iran is flexing naval posture.', evidence_ids: ['ev1'] }, + balance_assessment: { text: 'Pressure 0.72 vs buffers 0.70.', evidence_ids: ['ev3'] }, + outlook_24h: { text: 'Base case holds.', evidence_ids: [] }, + outlook_7d: { text: 'Escalation risk climbs.', evidence_ids: ['ev2'] }, + outlook_30d: { text: 'Uncertainty widens.', evidence_ids: [] }, + watch_items: [ + { text: 'Watch Hormuz transit volume.', evidence_ids: ['ev1'] }, + ], + }); + const { narrative, valid } = parseNarrativeJson(text, validIds); + assert.equal(valid, true); + assert.equal(narrative.situation.text, 'Iran is flexing naval posture.'); + assert.deepEqual(narrative.situation.evidence_ids, ['ev1']); + assert.equal(narrative.watch_items.length, 1); + }); + + it('strips hallucinated evidence IDs not in the provided set', () => { + const text = JSON.stringify({ + situation: { text: 'Some text.', evidence_ids: ['ev1', 'hallucinated', 'ev2'] }, + balance_assessment: { text: 'B.', evidence_ids: ['nope'] }, + outlook_24h: { text: 'O24.', evidence_ids: [] }, + outlook_7d: { text: 'O7.', evidence_ids: [] }, + outlook_30d: { text: 'O30.', evidence_ids: [] }, + watch_items: [], + }); + const { narrative, valid } = parseNarrativeJson(text, validIds); + assert.equal(valid, true); + assert.deepEqual(narrative.situation.evidence_ids, ['ev1', 'ev2']); + assert.deepEqual(narrative.balance_assessment.evidence_ids, []); + }); + + it('extracts JSON from prose-wrapped output', () => { + const text = 'Sure, here is the JSON:\n```json\n' + JSON.stringify({ + situation: { text: 'x', evidence_ids: [] }, + balance_assessment: { text: '', evidence_ids: [] }, + outlook_24h: { text: '', evidence_ids: [] }, + outlook_7d: { text: '', 
evidence_ids: [] }, + outlook_30d: { text: '', evidence_ids: [] }, + watch_items: [], + }) + '\n```\n'; + const { narrative, valid } = parseNarrativeJson(text, validIds); + assert.equal(valid, true); + assert.equal(narrative.situation.text, 'x'); + }); + + it('returns valid=false for an all-empty JSON object', () => { + const text = JSON.stringify({ + situation: { text: '', evidence_ids: [] }, + balance_assessment: { text: '', evidence_ids: [] }, + outlook_24h: { text: '', evidence_ids: [] }, + outlook_7d: { text: '', evidence_ids: [] }, + outlook_30d: { text: '', evidence_ids: [] }, + watch_items: [], + }); + const { valid } = parseNarrativeJson(text, validIds); + assert.equal(valid, false); + }); + + it('returns valid=false on unparseable garbage', () => { + const { narrative, valid } = parseNarrativeJson('not json at all, just prose', validIds); + assert.equal(valid, false); + assert.deepEqual(narrative, emptyNarrative()); + }); + + it('returns valid=false for null/empty input', () => { + assert.equal(parseNarrativeJson('', validIds).valid, false); + assert.equal(parseNarrativeJson(null, validIds).valid, false); + assert.equal(parseNarrativeJson(undefined, validIds).valid, false); + }); + + it('caps watch_items at the enforced maximum', () => { + const text = JSON.stringify({ + situation: { text: 'x', evidence_ids: [] }, + balance_assessment: { text: '', evidence_ids: [] }, + outlook_24h: { text: '', evidence_ids: [] }, + outlook_7d: { text: '', evidence_ids: [] }, + outlook_30d: { text: '', evidence_ids: [] }, + watch_items: [ + { text: 'w1', evidence_ids: [] }, + { text: 'w2', evidence_ids: [] }, + { text: 'w3', evidence_ids: [] }, + { text: 'w4', evidence_ids: [] }, + { text: 'w5', evidence_ids: [] }, + ], + }); + const { narrative } = parseNarrativeJson(text, validIds); + assert.ok(narrative.watch_items.length <= 3); + }); +}); + +// ──────────────────────────────────────────────────────────────────────────── +// generateRegionalNarrative (with injected 
callLlm) +// ──────────────────────────────────────────────────────────────────────────── + +describe('generateRegionalNarrative', () => { + function mockCall(text, providerName = 'groq', modelName = 'llama-3.3-70b-versatile') { + return async () => ({ text, provider: providerName, model: modelName }); + } + + const validPayload = { + situation: { text: 'Iran flexes naval posture near Hormuz.', evidence_ids: ['ev1'] }, + balance_assessment: { text: 'Net balance slightly positive.', evidence_ids: ['ev3'] }, + outlook_24h: { text: 'Base case dominates.', evidence_ids: [] }, + outlook_7d: { text: 'Escalation risk rises.', evidence_ids: ['ev2'] }, + outlook_30d: { text: 'Uncertainty widens.', evidence_ids: [] }, + watch_items: [{ text: 'Watch Hormuz transit counts.', evidence_ids: ['ev1'] }], + }; + + it('returns the parsed narrative + provider + model on success', async () => { + const result = await generateRegionalNarrative( + menaRegion, + stubSnapshot(), + evidenceFixture, + { callLlm: mockCall(JSON.stringify(validPayload)) }, + ); + assert.equal(result.provider, 'groq'); + assert.equal(result.model, 'llama-3.3-70b-versatile'); + assert.equal(result.narrative.situation.text, 'Iran flexes naval posture near Hormuz.'); + assert.deepEqual(result.narrative.situation.evidence_ids, ['ev1']); + }); + + it('skips the global region and returns empty narrative', async () => { + let called = false; + const callLlm = async () => { + called = true; + return { text: '', provider: '', model: '' }; + }; + const result = await generateRegionalNarrative(globalRegion, stubSnapshot({ region_id: 'global' }), evidenceFixture, { callLlm }); + assert.equal(called, false, 'global region must not call LLM'); + assert.equal(result.provider, ''); + assert.equal(result.model, ''); + assert.deepEqual(result.narrative, emptyNarrative()); + }); + + it('ships empty narrative when callLlm returns null (all providers failed)', async () => { + const result = await generateRegionalNarrative( + 
menaRegion, + stubSnapshot(), + evidenceFixture, + { callLlm: async () => null }, + ); + assert.deepEqual(result.narrative, emptyNarrative()); + assert.equal(result.provider, ''); + assert.equal(result.model, ''); + }); + + it('ships empty narrative when the LLM returns garbage text', async () => { + const result = await generateRegionalNarrative( + menaRegion, + stubSnapshot(), + evidenceFixture, + { callLlm: mockCall('not json, just prose from a confused model') }, + ); + assert.deepEqual(result.narrative, emptyNarrative()); + assert.equal(result.provider, ''); + }); + + it('ships empty narrative and does not throw when callLlm throws', async () => { + const result = await generateRegionalNarrative( + menaRegion, + stubSnapshot(), + evidenceFixture, + { + callLlm: async () => { + throw new Error('network blown up'); + }, + }, + ); + assert.deepEqual(result.narrative, emptyNarrative()); + assert.equal(result.provider, ''); + assert.equal(result.model, ''); + }); + + it('filters hallucinated evidence IDs end-to-end', async () => { + const payloadWithHallucination = { + ...validPayload, + situation: { text: 'x', evidence_ids: ['ev1', 'fake-id'] }, + }; + const result = await generateRegionalNarrative( + menaRegion, + stubSnapshot(), + evidenceFixture, + { callLlm: mockCall(JSON.stringify(payloadWithHallucination)) }, + ); + assert.deepEqual(result.narrative.situation.evidence_ids, ['ev1']); + }); + + it('records the provider name the LLM came back from', async () => { + const result = await generateRegionalNarrative( + menaRegion, + stubSnapshot(), + evidenceFixture, + { callLlm: mockCall(JSON.stringify(validPayload), 'openrouter', 'google/gemini-2.5-flash') }, + ); + assert.equal(result.provider, 'openrouter'); + assert.equal(result.model, 'google/gemini-2.5-flash'); + }); +}); + +// ──────────────────────────────────────────────────────────────────────────── +// emptyNarrative shape +// ──────────────────────────────────────────────────────────────────────────── + 
describe('emptyNarrative', () => {
  it('matches the RegionalNarrative shape with empty fields', () => {
    const n = emptyNarrative();
    assert.equal(n.situation.text, '');
    assert.deepEqual(n.situation.evidence_ids, []);
    assert.equal(n.balance_assessment.text, '');
    assert.equal(n.outlook_24h.text, '');
    assert.equal(n.outlook_7d.text, '');
    assert.equal(n.outlook_30d.text, '');
    assert.deepEqual(n.watch_items, []);
  });

  it('returns a fresh object each call (no shared mutable state)', () => {
    const a = emptyNarrative();
    const b = emptyNarrative();
    // Mutating one instance must not be observable through another.
    a.situation.evidence_ids.push('leaked');
    assert.deepEqual(b.situation.evidence_ids, []);
  });
});

// ────────────────────────────────────────────────────────────────────────────
// Review-fix regression tests (PR #2960 P2/P3 findings)
// ────────────────────────────────────────────────────────────────────────────

/**
 * Build `n` synthetic market-signal evidence items with ids ev0..ev{n-1}.
 * Shared by the cap/window tests below so the fixture shape lives in one place.
 *
 * @param {number} n - number of items to generate
 * @returns {Array<object>} evidence items in id order
 */
function makeEvidenceList(n) {
  return Array.from({ length: n }, (_, i) => ({
    id: `ev${i}`,
    type: 'market_signal',
    source: 'test',
    summary: `item ${i}`,
    confidence: 0.5,
    observed_at: 0,
    theater: '',
    corridor: '',
  }));
}

describe('selectPromptEvidence', () => {
  it('caps evidence at the prompt-visible maximum', () => {
    const sliced = selectPromptEvidence(makeEvidenceList(25));
    // selectPromptEvidence is slice(0, MAX_EVIDENCE_IN_PROMPT) with the cap
    // at 15, so 25 inputs must yield exactly 15 — assert equality, not <=.
    assert.equal(sliced.length, 15, `expected exactly 15, got ${sliced.length}`);
    // Must preserve order — the first N items are what the prompt sees.
    assert.equal(sliced[0].id, 'ev0');
    assert.equal(sliced[sliced.length - 1].id, 'ev14');
  });

  it('returns an empty array for non-array input', () => {
    assert.deepEqual(selectPromptEvidence(null), []);
    assert.deepEqual(selectPromptEvidence(undefined), []);
  });

  it('returns the full array when under the cap', () => {
    assert.equal(selectPromptEvidence(evidenceFixture).length, 3);
  });
});

describe('provider fallback on malformed response (P2 fix)', () => {
  // Simulate the provider-chain behavior of the default callLlm: the
  // mock walks a provider list and honors the `validate` callback so the
  // chain falls through on parse failure rather than short-circuiting.
  function buildFallbackMock(providers) {
    return async (_prompt, opts = {}) => {
      const validate = opts.validate;
      for (const p of providers) {
        if (validate && !validate(p.text)) continue;
        return { text: p.text, provider: p.provider, model: p.model };
      }
      return null;
    };
  }

  const validPayload = JSON.stringify({
    situation: { text: 'Iran flexes naval posture.', evidence_ids: ['ev1'] },
    balance_assessment: { text: 'Net balance slightly positive.', evidence_ids: [] },
    outlook_24h: { text: 'Base case dominates.', evidence_ids: [] },
    outlook_7d: { text: 'Escalation risk rises.', evidence_ids: [] },
    outlook_30d: { text: 'Uncertainty widens.', evidence_ids: [] },
    watch_items: [],
  });

  it('falls through when Groq returns prose and OpenRouter returns valid JSON', async () => {
    const callLlm = buildFallbackMock([
      { text: 'Sure, here is a summary of the situation...', provider: 'groq', model: 'llama-3.3' },
      { text: validPayload, provider: 'openrouter', model: 'google/gemini-2.5-flash' },
    ]);
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), evidenceFixture, { callLlm });
    assert.equal(result.provider, 'openrouter');
    assert.equal(result.model, 'google/gemini-2.5-flash');
    assert.equal(result.narrative.situation.text, 'Iran flexes naval posture.');
  });

  it('falls through when Groq returns truncated JSON and OpenRouter succeeds', async () => {
    const callLlm = buildFallbackMock([
      { text: '{"situation": {"text": "Iran flexes nav', provider: 'groq', model: 'llama-3.3' },
      { text: validPayload, provider: 'openrouter', model: 'google/gemini-2.5-flash' },
    ]);
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), evidenceFixture, { callLlm });
    assert.equal(result.provider, 'openrouter');
    assert.equal(result.narrative.situation.text, 'Iran flexes naval posture.');
  });

  it('falls through on all-empty-fields JSON from the first provider', async () => {
    const allEmpty = JSON.stringify({
      situation: { text: '', evidence_ids: [] },
      balance_assessment: { text: '', evidence_ids: [] },
      outlook_24h: { text: '', evidence_ids: [] },
      outlook_7d: { text: '', evidence_ids: [] },
      outlook_30d: { text: '', evidence_ids: [] },
      watch_items: [],
    });
    const callLlm = buildFallbackMock([
      { text: allEmpty, provider: 'groq', model: 'llama-3.3' },
      { text: validPayload, provider: 'openrouter', model: 'google/gemini-2.5-flash' },
    ]);
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), evidenceFixture, { callLlm });
    assert.equal(result.provider, 'openrouter');
  });

  it('returns empty narrative when every provider returns malformed output', async () => {
    const callLlm = buildFallbackMock([
      { text: 'prose one', provider: 'groq', model: 'llama-3.3' },
      { text: 'prose two', provider: 'openrouter', model: 'google/gemini-2.5-flash' },
    ]);
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), evidenceFixture, { callLlm });
    assert.deepEqual(result.narrative, emptyNarrative());
    assert.equal(result.provider, '');
  });
});

describe('evidence validator scoped to prompt-visible slice (P2 fix)', () => {
  it('rejects hallucinated citations to evidence beyond the visible window', async () => {
    // 20 evidence items; the prompt/validator should only see the first 15.
    // The LLM cites ev16 (beyond the window) — that citation must be stripped.
    const many = makeEvidenceList(20);
    const payload = JSON.stringify({
      // ev16 is in the full list (index 16) but NOT in the first-15 slice.
      situation: { text: 'Test citation filter.', evidence_ids: ['ev0', 'ev16', 'ev14'] },
      balance_assessment: { text: 'B.', evidence_ids: [] },
      outlook_24h: { text: 'O.', evidence_ids: [] },
      outlook_7d: { text: 'O.', evidence_ids: [] },
      outlook_30d: { text: 'O.', evidence_ids: [] },
      watch_items: [],
    });
    const callLlm = async () => ({ text: payload, provider: 'groq', model: 'llama-3.3' });
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), many, { callLlm });
    // ev0 and ev14 are in the first-15 slice; ev16 is not.
    assert.deepEqual(result.narrative.situation.evidence_ids, ['ev0', 'ev14']);
  });

  it('allows citations to any of the first 15 items', async () => {
    const many = makeEvidenceList(20);
    const payload = JSON.stringify({
      situation: { text: 'Cite the edges.', evidence_ids: ['ev0', 'ev14'] },
      balance_assessment: { text: '', evidence_ids: [] },
      outlook_24h: { text: '', evidence_ids: [] },
      outlook_7d: { text: '', evidence_ids: [] },
      outlook_30d: { text: '', evidence_ids: [] },
      watch_items: [],
    });
    const callLlm = async () => ({ text: payload, provider: 'groq', model: 'llama-3.3' });
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), many, { callLlm });
    assert.deepEqual(result.narrative.situation.evidence_ids, ['ev0', 'ev14']);
  });
});

describe('narrative_model records actual provider output (P3 fix)', () => {
  it('passes the model value the default caller returned through to the meta', async () => {
    // Simulate the default caller picking up json.model (which may resolve
    // to a different concrete model than the one requested).
    const actualModel = 'llama-3.3-70b-versatile-0325';
    const payload = JSON.stringify({
      situation: { text: 'Test.', evidence_ids: [] },
      balance_assessment: { text: '', evidence_ids: [] },
      outlook_24h: { text: '', evidence_ids: [] },
      outlook_7d: { text: '', evidence_ids: [] },
      outlook_30d: { text: '', evidence_ids: [] },
      watch_items: [],
    });
    const callLlm = async () => ({ text: payload, provider: 'groq', model: actualModel });
    const result = await generateRegionalNarrative(menaRegion, stubSnapshot(), evidenceFixture, { callLlm });
    assert.equal(result.model, actualModel);
  });
});