Files
worldmonitor/tests/brief-why-matters-analyst.test.mjs
Elie Habib ec35cf4158 feat(brief): analyst prompt v2 — multi-sentence, grounded, story description (#3269)
* feat(brief): analyst prompt v2 — multi-sentence, grounded, includes story description

Shadow-diff of 12 prod stories on 2026-04-21 showed v1 analyst output
indistinguishable from legacy Gemini: identical single-sentence
abstraction ("destabilize / systemic / sovereign risk repricing") with
no named actors, metrics, or dates — in several cases Gemini was MORE
specific.

Root cause: 18–30 word cap compressed context specifics out.

v2 loosens three dials at once so we can settle the A/B:

1. New system prompt WHY_MATTERS_ANALYST_SYSTEM_V2 — 2–3 sentences,
   40–70 words, implicit SITUATION→ANALYSIS→(optional) WATCH arc,
   MUST cite one specific named actor / metric / date / place from
   the context. Analyst path only; gemini path stays on v1.

2. New parser parseWhyMattersV2 — accepts 100–500 chars, rejects
   preamble boilerplate + leaked section labels + markdown.

3. Story description plumbed through — endpoint body accepts optional
   story.description (≤ 1000 chars, body cap bumped 4 KB → 8 KB).
   Cron forwards it when upstream has one (skipped when it equals the
   headline — no new signal).

Cache + shadow bumped v3 → v4 / v1 → v2 so fresh output lands on the
first post-deploy cron tick. maxTokens 180 → 260 for ~3× output length.

If shadow-diff 24h after deploy still shows no delta vs gemini, the kill
switch is BRIEF_WHY_MATTERS_PRIMARY=gemini on Vercel (instant, no redeploy).

Tests: 6059 pass (was 6022 + 37 new). typecheck × 2 clean.

* fix(brief): stop truncating v2 multi-sentence output + description in cache hash

Two P1s caught in PR #3269 review.

P1a — cron reparsed endpoint output with v1 single-sentence parser,
silently dropping sentences 2+3 of v2 analyst output. The endpoint had
ALREADY validated the string (parseWhyMattersV2 for analyst path;
parseWhyMatters for gemini). Re-parsing with v1 took only the first
sentence — the exact regression #3269 was meant to fix.

Fix: trust the endpoint. Replace re-parse with bounds check (30–500
chars) + stub-echo reject. Added regression test asserting multi-
sentence output reaches the envelope unchanged.

P1b — `story.description` flowed into the analyst prompt but NOT into
the cache hash. Two requests with identical core fields but different
descriptions collided on one cache slot → second caller got prose
grounded in the FIRST caller's description.

Fix: add `description` as the 6th field of `hashBriefStory`. Bump
endpoint cache v4→v5 and shadow v2→v3 so buggy 5-field entries are
dropped. Updated the parity sentinel in brief-llm-core.test.mjs to
match 6-field semantics. Added regression tests covering different-
descriptions-differ and present-vs-absent-differ.

Tests: 6083 pass. typecheck × 2 clean.
2026-04-21 22:25:54 +04:00

573 lines
22 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Integration tests for the /api/internal/brief-why-matters edge endpoint
* + the cron's analyst-priority fallback chain.
*
* The endpoint is a .ts file; we test the pure helpers that go into it
* (country normalizer, core hashing, prompt builder, context trim, env
* parsing) plus simulate the handler end-to-end via the imported
* modules. The cron-side `generateWhyMatters` priority chain is covered
* directly via in-process dep injection.
*
* Run: node --test tests/brief-why-matters-analyst.test.mjs
*/
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import { generateWhyMatters } from '../scripts/lib/brief-llm.mjs';
import {
hashBriefStory,
parseWhyMatters,
WHY_MATTERS_SYSTEM,
} from '../shared/brief-llm-core.js';
// ── Story fixture matching the cron's actual payload shape
// (shared/brief-filter.js:134-135). ────────────────────────────────────
/**
 * Build a story fixture for the tests in this file.
 *
 * @param {object} [overrides] - Fields laid over the defaults; keys given
 *   here win, and keys not in the defaults are appended.
 * @returns {object} A fresh story object on every call.
 */
function story(overrides = {}) {
  const defaults = {
    headline: 'Iran closes Strait of Hormuz',
    source: 'Reuters',
    threatLevel: 'critical',
    category: 'Geopolitical Risk',
    country: 'IR',
  };
  return { ...defaults, ...overrides };
}
// ── Country normalizer ───────────────────────────────────────────────────
describe('normalizeCountryToIso2', () => {
  // Assigned by the first test below; every later test in this suite calls it.
  let normalize;

  // Stand-in normalizer built from the shared gazetteer JSON. Used only when
  // the real .ts module cannot be imported by the current runner.
  async function loadMirrorNormalizer() {
    const { default: COUNTRY_NAMES } = await import('../shared/country-names.json', {
      with: { type: 'json' },
    });
    const knownIso2 = new Set(Object.values(COUNTRY_NAMES));
    return (raw) => {
      if (typeof raw !== 'string') return null;
      const value = raw.trim();
      if (value === '' || value.toLowerCase() === 'global') return null;
      // Two-letter input is treated as an ISO2 candidate and must be a known code.
      if (/^[A-Za-z]{2}$/.test(value)) {
        const code = value.toUpperCase();
        return knownIso2.has(code) ? code : null;
      }
      // Anything else is looked up by lower-cased name.
      const named = COUNTRY_NAMES[value.toLowerCase()];
      return typeof named === 'string' ? named : null;
    };
  }

  // Runs `normalize` over [input, expected] pairs in order.
  function expectAll(pairs) {
    for (const [input, expected] of pairs) {
      assert.equal(normalize(input), expected);
    }
  }

  it('loads from server/_shared/country-normalize.ts via tsx or compiled', async () => {
    // Prefer the real module. If the direct .ts import fails under this
    // runner, fall back to an inline mirror of the same (trivial) logic.
    try {
      ({ normalizeCountryToIso2: normalize } = await import('../server/_shared/country-normalize.ts'));
    } catch {
      normalize = await loadMirrorNormalizer();
    }
    assert.ok(typeof normalize === 'function');
  });

  it('passes through valid ISO2 case-insensitively', () => {
    expectAll([
      ['US', 'US'],
      ['us', 'US'],
      ['IR', 'IR'],
      ['gb', 'GB'],
    ]);
  });

  it('resolves full names case-insensitively', () => {
    expectAll([
      ['United States', 'US'],
      ['united states', 'US'],
      ['Iran', 'IR'],
      ['United Kingdom', 'GB'],
    ]);
  });

  it("'Global' sentinel maps to null (non-country; not an error)", () => {
    expectAll([
      ['Global', null],
      ['global', null],
      ['GLOBAL', null],
    ]);
  });

  it('rejects unknown / empty / undefined / non-string inputs', () => {
    expectAll([
      ['', null],
      [' ', null],
      ['Nowhere', null],
      [undefined, null],
      [null, null],
      [123, null],
    ]);
  });

  it('resolves common non-ISO2 abbreviations when they exist in the gazetteer', () => {
    // The plan assumed "USA" was absent from the gazetteer, but it is mapped.
    // A 3+ character abbreviation goes through the name-lookup path.
    assert.equal(normalize('USA'), 'US');
  });

  it('rejects ISO2-shaped values not in the gazetteer', () => {
    // Structurally valid two-letter codes that the gazetteer does not list.
    expectAll([
      ['ZZ', null],
      ['XY', null],
    ]);
  });
});
// ── Cache-key stability ──────────────────────────────────────────────────
describe('cache key identity', () => {
  // `description` is the 6th hash field (added alongside the analyst v2
  // prompt). Without it in the key, two stories differing only by
  // description would share one cache slot.
  const DESCRIPTION = 'Tehran publicly reopened the Strait of Hormuz to commercial shipping today.';
  const HASH_FIELDS = ['headline', 'source', 'threatLevel', 'category', 'country', 'description'];

  it('hashBriefStory stable across the 6-field material', async () => {
    // Check determinism both without and with the optional description.
    for (const overrides of [{}, { description: DESCRIPTION }]) {
      const a = await hashBriefStory(story(overrides));
      const b = await hashBriefStory(story(overrides));
      assert.equal(a, b);
    }
  });

  it('hashBriefStory differs when any hash-field differs', async () => {
    const base = story({ description: DESCRIPTION });
    const baseline = await hashBriefStory(base);
    for (const f of HASH_FIELDS) {
      // Perturb exactly one field and expect a different key.
      const h = await hashBriefStory({ ...base, [f]: `${base[f]}X` });
      assert.notEqual(h, baseline, `${f} must be part of cache identity`);
    }
  });

  it('hashBriefStory differs when description is present vs absent', async () => {
    const without = await hashBriefStory(story());
    const withDescription = await hashBriefStory(story({ description: DESCRIPTION }));
    assert.notEqual(withDescription, without, 'description presence must change cache identity');
  });
});
// ── Deterministic shadow sampling ────────────────────────────────────────
describe('shadow sample deterministic hashing', () => {
  // Mirror of the endpoint's sample decision — any drift between this and
  // the endpoint would silently change which stories get shadow-sampled.
  //
  // hash16: hex string; only its first 8 hex digits are used.
  // pct:    integer percentage; >= 100 always samples, <= 0 never does.
  // Returns true when the story falls inside the sampled bucket range.
  function sampleHit(hash16, pct) {
    if (pct >= 100) return true;
    if (pct <= 0) return false;
    const bucket = Number.parseInt(hash16.slice(0, 8), 16) % 100;
    return bucket < pct;
  }

  const EDGE_HASHES = ['0000000000000000', 'ffffffffffffffff', 'abcdef0123456789'];

  it('pct=100 always hits', () => {
    for (const h of EDGE_HASHES) {
      assert.equal(sampleHit(h, 100), true);
    }
  });

  it('pct=0 never hits', () => {
    for (const h of EDGE_HASHES) {
      assert.equal(sampleHit(h, 0), false);
    }
  });

  it('pct=25 hits approximately 25% on a bulk sample, and is deterministic', async () => {
    const N = 400;
    let hits = 0;
    for (let i = 0; i < N; i++) {
      const h = await hashBriefStory(story({ headline: `fixture-${i}` }));
      // Same hash, same pct → same decision on every evaluation.
      const first = sampleHit(h, 25);
      const second = sampleHit(h, 25);
      assert.equal(first, second, `hash ${h} must give the same decision`);
      if (first) hits++;
    }
    // Loose ±10-point band around the 25% target so the test is not flaky.
    assert.ok(hits > N * 0.15, `expected > 15% hits, got ${hits}`);
    assert.ok(hits < N * 0.35, `expected < 35% hits, got ${hits}`);
  });
});
// ── `generateWhyMatters` analyst-priority chain ─────────────────────────
describe('generateWhyMatters — analyst priority', () => {
  const VALID = 'Closure of the Strait of Hormuz would spike global oil prices and force a US naval response.';

  // Cache stubs that always miss on read and ignore writes, so every test
  // exercises the generation path.
  const noCache = () => ({
    cacheGet: async () => null,
    cacheSet: async () => {},
  });

  // Legacy-LLM stub that records whether it was invoked and resolves to `reply`.
  function legacySpy(reply) {
    const spy = {
      invoked: false,
      callLLM: async () => {
        spy.invoked = true;
        return reply;
      },
    };
    return spy;
  }

  it('uses the analyst endpoint result when it returns a string', async () => {
    const legacy = legacySpy('FALLBACK unused');
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => VALID,
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, VALID);
    assert.equal(legacy.invoked, false, 'legacy callLLM must NOT fire when analyst returns');
  });

  it('falls through to legacy chain when analyst returns null', async () => {
    const legacy = legacySpy(VALID);
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => null,
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, VALID);
    assert.equal(legacy.invoked, true, 'legacy callLLM must fire after analyst miss');
  });

  it('falls through when analyst returns out-of-bounds output (too short)', async () => {
    const legacy = legacySpy(VALID);
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => 'Short.',
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, VALID);
    assert.equal(legacy.invoked, true, 'out-of-bounds analyst output must trigger fallback');
  });

  it('preserves multi-sentence v2 analyst output verbatim (P1 regression guard)', async () => {
    // The endpoint now returns 2–3 sentences that it has already validated.
    // The cron must not re-parse that string with the v1 single-sentence
    // parser, which would silently drop the 2nd and 3rd sentences (caught in
    // PR #3269 review). It should only reject obvious garbage here
    // (length / stub echo) and otherwise pass the text through.
    const multi = [
      "Iran's closure of the Strait of Hormuz on April 21 halts roughly 20% of global seaborne oil. ",
      'The disruption forces an immediate repricing of sovereign risk across Gulf energy exporters. ',
      'Watch IMF commentary in the next 48 hours for cascading guidance.',
    ].join('');
    const legacy = legacySpy(VALID);
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => multi,
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, multi, 'multi-sentence v2 output must reach the envelope unchanged');
    assert.equal(legacy.invoked, false, 'legacy callLLM must not fire when v2 analyst succeeds');
    // Sanity: the result still contains more than the first sentence.
    assert.ok(out.split('. ').length >= 2, 'output must retain 2nd+ sentences');
  });

  it('falls through when analyst throws', async () => {
    const legacy = legacySpy(VALID);
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => {
        throw new Error('network timeout');
      },
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, VALID);
    assert.equal(legacy.invoked, true);
  });

  it('returns null when BOTH layers fail (caller uses stub)', async () => {
    const out = await generateWhyMatters(story(), {
      callAnalystWhyMatters: async () => null,
      callLLM: async () => null,
      ...noCache(),
    });
    assert.equal(out, null);
  });

  it('no callAnalystWhyMatters dep → legacy chain runs directly (backcompat)', async () => {
    // The analyst dependency is deliberately omitted from the deps object.
    const legacy = legacySpy(VALID);
    const out = await generateWhyMatters(story(), {
      callLLM: legacy.callLLM,
      ...noCache(),
    });
    assert.equal(out, VALID);
    assert.equal(legacy.invoked, true);
  });
});
// ── Body validation (simulated — same rules as endpoint's
// validateStoryBody) ────────────────────────────────────────────────────
describe('endpoint validation contract', () => {
  // Mirror of the endpoint's validation so unit tests don't need the
  // full edge runtime. Any divergence would surface as a cross-suite
  // test regression on the endpoint flow (see "endpoint end-to-end" below).
  const VALID_THREAT = new Set(['critical', 'high', 'medium', 'low']);
  const CAPS = { headline: 400, source: 120, category: 80, country: 80 };
  // Body cap was raised from 4 KB to 8 KB when the optional
  // story.description field was added (PR #3269).
  // NOTE(review): confirm this matches the handler's constant.
  const MAX_BODY_BYTES = 8192;

  // NOTE(review): the endpoint also accepts an optional story.description
  // (≤ 1000 chars per PR #3269). Its exact rejection behaviour is not visible
  // from this file, so it is intentionally not mirrored here — TODO confirm
  // and add.

  /**
   * Validate a request body the way the endpoint is expected to.
   * @param {unknown} raw - Parsed JSON body.
   * @returns {{ok: true} | {ok: false, msg: string}} `msg` names the first
   *   failing field, suffixed with `-length` when a length cap is exceeded.
   */
  function validate(raw) {
    if (!raw || typeof raw !== 'object') return { ok: false, msg: 'body' };
    const s = raw.story;
    if (!s || typeof s !== 'object') return { ok: false, msg: 'body.story' };
    // Required, non-empty, length-capped string fields.
    for (const f of ['headline', 'source', 'category']) {
      if (typeof s[f] !== 'string' || s[f].length === 0) return { ok: false, msg: f };
      if (s[f].length > CAPS[f]) return { ok: false, msg: `${f}-length` };
    }
    if (typeof s.threatLevel !== 'string' || !VALID_THREAT.has(s.threatLevel)) {
      return { ok: false, msg: 'threatLevel' };
    }
    // country is optional; when present it must be a (possibly empty) capped string.
    if (s.country !== undefined) {
      if (typeof s.country !== 'string') return { ok: false, msg: 'country' };
      if (s.country.length > CAPS.country) return { ok: false, msg: 'country-length' };
    }
    return { ok: true };
  }

  // UTF-8 byte length of the JSON-serialized payload.
  function measureBytes(obj) {
    return new TextEncoder().encode(JSON.stringify(obj)).byteLength;
  }

  it('accepts a valid payload', () => {
    assert.deepEqual(validate({ story: story() }), { ok: true });
  });

  it('rejects threatLevel="info" (not in the 4-value enum)', () => {
    const out = validate({ story: story({ threatLevel: 'info' }) });
    assert.equal(out.ok, false);
    assert.equal(out.msg, 'threatLevel');
  });

  it('accepts free-form category (no allowlist)', () => {
    for (const cat of ['General', 'Geopolitical Risk', 'Market Activity', 'Humanitarian Crisis']) {
      assert.deepEqual(validate({ story: story({ category: cat }) }), { ok: true });
    }
  });

  it('rejects category exceeding length cap', () => {
    const long = 'x'.repeat(CAPS.category + 1);
    const out = validate({ story: story({ category: long }) });
    assert.equal(out.ok, false);
    assert.equal(out.msg, 'category-length');
  });

  it('rejects empty required fields', () => {
    for (const f of ['headline', 'source', 'category']) {
      const out = validate({ story: story({ [f]: '' }) });
      assert.equal(out.ok, false);
      assert.equal(out.msg, f);
    }
  });

  it('accepts empty country + country="Global" + missing country', () => {
    assert.deepEqual(validate({ story: story({ country: '' }) }), { ok: true });
    assert.deepEqual(validate({ story: story({ country: 'Global' }) }), { ok: true });
    const { country: _, ...withoutCountry } = story();
    assert.deepEqual(validate({ story: withoutCountry }), { ok: true });
  });

  it('body cap catches oversize payloads (both Content-Length and post-read)', () => {
    const bloated = {
      story: {
        ...story(),
        // Artificial oversize payload — would need the per-field caps bypassed
        // to reach in practice, but the total body-byte cap must still fire.
        // Sized from the cap so the fixture stays oversize if the cap changes.
        extra: 'x'.repeat(MAX_BODY_BYTES + 1),
      },
    };
    assert.ok(measureBytes(bloated) > MAX_BODY_BYTES, 'fixture is oversize');
    // Note: body-cap is enforced at the handler level, not the validator.
    // We assert the invariant about the measure here; the handler path is
    // covered by the endpoint smoke test below.
  });
});
// ── Prompt builder shape ──────────────────────────────────────────────
describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
  // Assigned by the 'loads' test; the remaining tests call it.
  let builder;

  // Fresh context object with every section empty unless overridden.
  // A new object is returned per call so no test shares state with another.
  function contextWith(overrides = {}) {
    return {
      worldBrief: '',
      countryBrief: '',
      riskScores: '',
      forecasts: '',
      marketData: '',
      macroSignals: '',
      degraded: false,
      ...overrides,
    };
  }

  it('loads', async () => {
    const mod = await import('../server/worldmonitor/intelligence/v1/brief-why-matters-prompt.ts');
    builder = mod.buildAnalystWhyMattersPrompt;
    assert.ok(typeof builder === 'function');
  });

  it('uses the analyst v2 system prompt (multi-sentence, grounded)', async () => {
    const { WHY_MATTERS_ANALYST_SYSTEM_V2 } = await import('../shared/brief-llm-core.js');
    const { system } = builder(story(), contextWith({ worldBrief: 'X' }));
    assert.equal(system, WHY_MATTERS_ANALYST_SYSTEM_V2);
    // Contract must still mention the 40–70 word target + grounding rule.
    // The range uses an en dash (U+2013), matching the prompt text.
    assert.match(system, /40–70 words/);
    assert.match(system, /named person \/ country \/ organization \/ number \/ percentage \/ date \/ city/);
  });

  it('includes story fields with the multi-sentence footer', () => {
    const { user } = builder(story(), contextWith());
    assert.match(user, /Headline: Iran closes Strait of Hormuz/);
    assert.match(user, /Source: Reuters/);
    assert.match(user, /Severity: critical/);
    assert.match(user, /Category: Geopolitical Risk/);
    assert.match(user, /Country: IR/);
    // En dashes (U+2013) in both ranges.
    assert.match(user, /Write 2–3 sentences \(40–70 words\)/);
    assert.match(user, /grounded in at least ONE specific/);
  });

  it('includes story description when present', () => {
    const storyWithDesc = {
      ...story(),
      description: 'Tehran publicly reopened the Strait of Hormuz to commercial shipping today.',
    };
    const { user } = builder(storyWithDesc, contextWith());
    assert.match(user, /Description: Tehran publicly reopened/);
  });

  it('omits description line when field absent', () => {
    const { user } = builder(story(), contextWith());
    assert.doesNotMatch(user, /Description:/);
  });

  it('omits context block when all fields empty', () => {
    const { user } = builder(story(), contextWith());
    assert.doesNotMatch(user, /# Live WorldMonitor Context/);
  });

  it('truncates context to stay under budget', () => {
    const hugeContext = contextWith({
      worldBrief: 'x'.repeat(5000),
      countryBrief: 'y'.repeat(5000),
      riskScores: 'z'.repeat(5000),
      forecasts: 'w'.repeat(5000),
      marketData: 'v'.repeat(5000),
      macroSignals: 'u'.repeat(5000),
    });
    const { user } = builder(story(), hugeContext);
    // Total user prompt should be bounded. Per plan: context budget ~1700
    // + story fields + footer ~250 → under 2.5KB.
    assert.ok(user.length < 2500, `prompt should be bounded; got ${user.length} chars`);
  });
});
// ── Env flag parsing (endpoint config resolution) ─────────────────────
describe('endpoint env flag parsing', () => {
  // Mirror the endpoint's readConfig logic so a drift between this
  // expectation and the handler fails one test suite.
  /**
   * Resolve the why-matters config from an env-like object.
   * @param {Record<string, string | undefined>} env
   * @returns {{
   *   primary: 'analyst' | 'gemini',
   *   invalidPrimaryRaw: string | null,
   *   shadowEnabled: boolean,
   *   samplePct: number,
   *   invalidSamplePctRaw: string | null,
   * }}
   */
  function readConfig(env) {
    const rawPrimary = (env.BRIEF_WHY_MATTERS_PRIMARY ?? '').trim().toLowerCase();
    let primary;
    let invalidPrimaryRaw = null;
    if (rawPrimary === '' || rawPrimary === 'analyst') primary = 'analyst';
    else if (rawPrimary === 'gemini') primary = 'gemini';
    else {
      // Unrecognized value: fall back to gemini and report what was seen.
      primary = 'gemini';
      invalidPrimaryRaw = rawPrimary;
    }
    // Shadow is on unless the flag is exactly the string "0".
    const shadowEnabled = env.BRIEF_WHY_MATTERS_SHADOW !== '0';
    const rawSample = env.BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT;
    let samplePct = 100;
    let invalidSamplePctRaw = null;
    if (rawSample !== undefined && rawSample !== '') {
      const parsed = Number.parseInt(rawSample, 10);
      // The round-trip comparison rejects inputs parseInt would only partly
      // consume, such as "50.5" or "25abc".
      if (Number.isInteger(parsed) && parsed >= 0 && parsed <= 100 && String(parsed) === rawSample.trim()) {
        samplePct = parsed;
      } else {
        invalidSamplePctRaw = rawSample;
      }
    }
    return { primary, invalidPrimaryRaw, shadowEnabled, samplePct, invalidSamplePctRaw };
  }

  it('defaults: primary=analyst, shadow=on, sample=100', () => {
    const c = readConfig({});
    assert.equal(c.primary, 'analyst');
    assert.equal(c.shadowEnabled, true);
    assert.equal(c.samplePct, 100);
  });

  it('PRIMARY=gemini is honoured (kill switch)', () => {
    const c = readConfig({ BRIEF_WHY_MATTERS_PRIMARY: 'gemini' });
    assert.equal(c.primary, 'gemini');
  });

  it('PRIMARY=analust (typo) falls back to gemini + invalidPrimaryRaw set', () => {
    const c = readConfig({ BRIEF_WHY_MATTERS_PRIMARY: 'analust' });
    assert.equal(c.primary, 'gemini');
    assert.equal(c.invalidPrimaryRaw, 'analust');
  });

  it('SHADOW disabled only by exact "0"', () => {
    for (const v of ['yes', '1', 'true', '', 'on']) {
      assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW: v }).shadowEnabled, true, `value=${v}`);
    }
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW: '0' }).shadowEnabled, false);
  });

  it('SAMPLE_PCT accepts integer 0–100; invalid → 100', () => {
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '25' }).samplePct, 25);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '0' }).samplePct, 0);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '100' }).samplePct, 100);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '101' }).samplePct, 100);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: 'foo' }).samplePct, 100);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '-5' }).samplePct, 100);
    assert.equal(readConfig({ BRIEF_WHY_MATTERS_SHADOW_SAMPLE_PCT: '50.5' }).samplePct, 100);
  });
});
// ── Gemini path prompt parity snapshot ────────────────────────────────
describe('Gemini path prompt parity', () => {
  it('buildWhyMattersPrompt output is stable (frozen snapshot)', async () => {
    const { buildWhyMattersPrompt } = await import('../scripts/lib/brief-llm.mjs');
    const { system, user } = buildWhyMattersPrompt(story());
    // Snapshot — if either the system prompt or the user prompt shape
    // changes, the endpoint's gemini-path output will drift from the
    // cron's pre-PR output. Treat any change here as raising the risk of a
    // BRIEF_WHY_MATTERS_PRIMARY=gemini rollout.
    // The word range uses an en dash (U+2013), matching the v1 prompt text.
    assert.match(system, /ONE concise sentence \(18–30 words\)/);
    // The first five lines of the user prompt are the story fields, in order.
    assert.equal(
      user.split('\n').slice(0, 5).join('\n'),
      [
        'Headline: Iran closes Strait of Hormuz',
        'Source: Reuters',
        'Severity: critical',
        'Category: Geopolitical Risk',
        'Country: IR',
      ].join('\n'),
    );
    assert.ok(user.endsWith('One editorial sentence on why this matters:'));
  });
});