mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
feat(brief): analyst prompt v2 — multi-sentence, grounded, story description (#3269)
* feat(brief): analyst prompt v2 — multi-sentence, grounded, includes story description
Shadow-diff of 12 prod stories on 2026-04-21 showed v1 analyst output
indistinguishable from legacy Gemini: identical single-sentence
abstraction ("destabilize / systemic / sovereign risk repricing") with
no named actors, metrics, or dates — in several cases Gemini was MORE
specific.
Root cause: 18–30 word cap compressed context specifics out.
v2 loosens three dials at once so we can settle the A/B:
1. New system prompt WHY_MATTERS_ANALYST_SYSTEM_V2 — 2–3 sentences,
40–70 words, implicit SITUATION→ANALYSIS→(optional) WATCH arc,
MUST cite one specific named actor / metric / date / place from
the context. Analyst path only; gemini path stays on v1.
2. New parser parseWhyMattersV2 — accepts 100–500 chars, rejects
preamble boilerplate + leaked section labels + markdown.
3. Story description plumbed through — endpoint body accepts optional
story.description (≤ 1000 chars, body cap bumped 4 KB → 8 KB).
Cron forwards it when upstream has one (skipped when it equals the
headline — no new signal).
Cache + shadow bumped v3 → v4 / v1 → v2 so fresh output lands on the
first post-deploy cron tick. maxTokens 180 → 260 for ~3× output length.
If shadow-diff 24h after deploy still shows no delta vs gemini, the kill
switch is BRIEF_WHY_MATTERS_PRIMARY=gemini on Vercel (instant, no redeploy).
Tests: 6059 pass (was 6022 + 37 new). typecheck × 2 clean.
* fix(brief): stop truncating v2 multi-sentence output + description in cache hash
Two P1s caught in PR #3269 review.
P1a — cron reparsed endpoint output with v1 single-sentence parser,
silently dropping sentences 2+3 of v2 analyst output. The endpoint had
ALREADY validated the string (parseWhyMattersV2 for analyst path;
parseWhyMatters for gemini). Re-parsing with v1 took only the first
sentence — the exact regression #3269 was meant to fix.
Fix: trust the endpoint. Replace re-parse with bounds check (30–500
chars) + stub-echo reject. Added regression test asserting multi-
sentence output reaches the envelope unchanged.
P1b — `story.description` flowed into the analyst prompt but NOT into
the cache hash. Two requests with identical core fields but different
descriptions collided on one cache slot → second caller got prose
grounded in the FIRST caller's description.
Fix: add `description` as the 6th field of `hashBriefStory`. Bump
endpoint cache v4→v5 and shadow v2→v3 so buggy 5-field entries are
dropped. Updated the parity sentinel in brief-llm-core.test.mjs to
match 6-field semantics. Added regression tests covering different-
descriptions-differ and present-vs-absent-differ.
Tests: 6083 pass. typecheck × 2 clean.
This commit is contained in:
@@ -50,6 +50,7 @@ import {
|
||||
buildWhyMattersUserPrompt,
|
||||
hashBriefStory,
|
||||
parseWhyMatters,
|
||||
parseWhyMattersV2,
|
||||
} from '../../shared/brief-llm-core.js';
|
||||
|
||||
// ── Env knobs (read at request entry so Railway/Vercel flips take effect
|
||||
@@ -109,12 +110,16 @@ const SHADOW_TTL_SEC = 7 * 24 * 60 * 60; // 7d
|
||||
|
||||
// ── Validation ────────────────────────────────────────────────────────
|
||||
const VALID_THREAT_LEVELS = new Set(['critical', 'high', 'medium', 'low']);
|
||||
const MAX_BODY_BYTES = 4096;
|
||||
// Bumped body cap to 8 KB: v2 optionally carries `story.description`
|
||||
// (up to 1000 chars) in addition to the other fields, which can push
|
||||
// worst-case payloads past the old 4 KB cap under UTF-8 expansion.
|
||||
const MAX_BODY_BYTES = 8192;
|
||||
const CAPS = {
|
||||
headline: 400,
|
||||
source: 120,
|
||||
category: 80,
|
||||
country: 80,
|
||||
description: 1000,
|
||||
};
|
||||
|
||||
interface StoryPayload {
|
||||
@@ -123,6 +128,8 @@ interface StoryPayload {
|
||||
threatLevel: string;
|
||||
category: string;
|
||||
country: string;
|
||||
/** Optional — gives the LLM a sentence of story context beyond the headline. */
|
||||
description?: string;
|
||||
}
|
||||
|
||||
type ValidationOk = { ok: true; story: StoryPayload };
|
||||
@@ -177,6 +184,19 @@ function validateStoryBody(raw: unknown): ValidationOk | ValidationErr {
|
||||
country = s.country;
|
||||
}
|
||||
|
||||
// description — optional; when present, flows into the analyst prompt
|
||||
// so the LLM has grounded story context beyond the headline.
|
||||
let description: string | undefined;
|
||||
if (s.description !== undefined && s.description !== null) {
|
||||
if (typeof s.description !== 'string') {
|
||||
return { ok: false, status: 400, error: 'story.description must be a string' };
|
||||
}
|
||||
if (s.description.length > CAPS.description) {
|
||||
return { ok: false, status: 400, error: `story.description exceeds ${CAPS.description} chars` };
|
||||
}
|
||||
if (s.description.length > 0) description = s.description;
|
||||
}
|
||||
|
||||
return {
|
||||
ok: true,
|
||||
story: {
|
||||
@@ -185,6 +205,7 @@ function validateStoryBody(raw: unknown): ValidationOk | ValidationErr {
|
||||
threatLevel: s.threatLevel,
|
||||
category: s.category as string,
|
||||
country,
|
||||
...(description ? { description } : {}),
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -200,20 +221,23 @@ async function runAnalystPath(story: StoryPayload, iso2: string | null): Promise
|
||||
{ role: 'system', content: system },
|
||||
{ role: 'user', content: user },
|
||||
],
|
||||
maxTokens: 180,
|
||||
// v2 prompt is 2–3 sentences / 40–70 words — roughly 3× v1's
|
||||
// single-sentence output, so bump maxTokens proportionally.
|
||||
maxTokens: 260,
|
||||
temperature: 0.4,
|
||||
timeoutMs: 15_000,
|
||||
// Provider is pinned via LLM_REASONING_PROVIDER env var (already
|
||||
// set to 'openrouter' in prod). `callLlmReasoning` routes through
|
||||
// the resolveProviderChain based on that env.
|
||||
// Note: no `validate` option. The post-call parseWhyMatters check
|
||||
// below handles rejection by returning null. Using validate inside
|
||||
// Note: no `validate` option. The post-call parseWhyMattersV2
|
||||
// check below handles rejection. Using validate inside
|
||||
// callLlmReasoning would walk the provider chain on parse-reject,
|
||||
// causing duplicate openrouter billings when only one provider is
|
||||
// configured in prod. See todo 245.
|
||||
// causing duplicate openrouter billings (see todo 245).
|
||||
});
|
||||
if (!result) return null;
|
||||
return parseWhyMatters(result.content);
|
||||
// v2 parser accepts multi-sentence output + rejects preamble /
|
||||
// leaked section labels. Analyst path ONLY — gemini path stays on v1.
|
||||
return parseWhyMattersV2(result.content);
|
||||
} catch (err) {
|
||||
console.warn(`[brief-why-matters] analyst path failed: ${err instanceof Error ? err.message : String(err)}`);
|
||||
return null;
|
||||
@@ -338,8 +362,16 @@ export default async function handler(req: Request, ctx?: EdgeContext): Promise<
|
||||
|
||||
// Cache identity.
|
||||
const hash = await hashBriefStory(story);
|
||||
const cacheKey = `brief:llm:whymatters:v3:${hash}`;
|
||||
const shadowKey = `brief:llm:whymatters:shadow:v1:${hash}`;
|
||||
// v5: `hashBriefStory` now includes `description` as a prompt input
|
||||
// so same-story + different description no longer collide on a single
|
||||
// cache entry (P1 caught in PR #3269 review — endpoint could serve
|
||||
// prose grounded in a PREVIOUS caller's description). Bumping v4→v5
|
||||
// invalidates the short-lived v4 entries written under the buggy
|
||||
// 5-field hash so fresh output lands on the next cron tick.
|
||||
const cacheKey = `brief:llm:whymatters:v5:${hash}`;
|
||||
// Shadow v2→v3 for the same reason — any v2 comparison pairs may be
|
||||
// grounded in the wrong description, so the A/B was noisy.
|
||||
const shadowKey = `brief:llm:whymatters:shadow:v3:${hash}`;
|
||||
|
||||
// Cache read. Any infrastructure failure → treat as miss (logged).
|
||||
let cached: WhyMattersEnvelope | null = null;
|
||||
|
||||
@@ -103,15 +103,24 @@ const BRIEF_LLM_SKIP_PROVIDERS = ['ollama', 'groq'];
|
||||
* }} deps
|
||||
*/
|
||||
export async function generateWhyMatters(story, deps) {
|
||||
// Priority path: analyst endpoint. It owns its own cache (v3) so
|
||||
// the cron doesn't touch Redis when the endpoint handles the story.
|
||||
// Priority path: analyst endpoint. It owns its own cache and has
|
||||
// ALREADY validated the output via parseWhyMatters (gemini path) or
|
||||
// parseWhyMattersV2 (analyst path, multi-sentence). We must NOT
|
||||
// re-parse here with the single-sentence v1 parser — that silently
|
||||
// truncates v2's 2–3-sentence output to the first sentence. Trust
|
||||
// the wire shape; only reject an obviously-bad payload (empty, stub
|
||||
// echo, or length outside the legal bounds for either parser).
|
||||
if (typeof deps.callAnalystWhyMatters === 'function') {
|
||||
try {
|
||||
const analystOut = await deps.callAnalystWhyMatters(story);
|
||||
if (typeof analystOut === 'string' && analystOut.length > 0) {
|
||||
const parsed = parseWhyMatters(analystOut);
|
||||
if (parsed) return parsed;
|
||||
console.warn('[brief-llm] callAnalystWhyMatters → fallback: analyst returned unparseable prose');
|
||||
if (typeof analystOut === 'string') {
|
||||
const trimmed = analystOut.trim();
|
||||
const lenOk = trimmed.length >= 30 && trimmed.length <= 500;
|
||||
const notStub = !/^story flagged by your sensitivity/i.test(trimmed);
|
||||
if (lenOk && notStub) return trimmed;
|
||||
console.warn(
|
||||
`[brief-llm] callAnalystWhyMatters → fallback: endpoint returned out-of-bounds or stub (len=${trimmed.length})`,
|
||||
);
|
||||
} else {
|
||||
console.warn('[brief-llm] callAnalystWhyMatters → fallback: null/empty response');
|
||||
}
|
||||
|
||||
@@ -150,6 +150,25 @@ const BRIEF_WHY_MATTERS_ENDPOINT_URL =
|
||||
*/
|
||||
async function callAnalystWhyMatters(story) {
|
||||
if (!RELAY_SECRET) return null;
|
||||
// Forward a trimmed story payload so the endpoint only sees the
|
||||
// fields it validates. `description` is NEW for prompt-v2 — when
|
||||
// upstream has a real one (falls back to headline via
|
||||
// shared/brief-filter.js:134), it gives the LLM a grounded sentence
|
||||
// beyond the headline. Skip when it equals the headline (no signal).
|
||||
const payload = {
|
||||
headline: story.headline ?? '',
|
||||
source: story.source ?? '',
|
||||
threatLevel: story.threatLevel ?? '',
|
||||
category: story.category ?? '',
|
||||
country: story.country ?? '',
|
||||
};
|
||||
if (
|
||||
typeof story.description === 'string' &&
|
||||
story.description.length > 0 &&
|
||||
story.description !== story.headline
|
||||
) {
|
||||
payload.description = story.description;
|
||||
}
|
||||
try {
|
||||
const resp = await fetch(BRIEF_WHY_MATTERS_ENDPOINT_URL, {
|
||||
method: 'POST',
|
||||
@@ -164,7 +183,7 @@ async function callAnalystWhyMatters(story) {
|
||||
'User-Agent': 'worldmonitor-digest-notifications/1.0',
|
||||
Accept: 'application/json',
|
||||
},
|
||||
body: JSON.stringify({ story }),
|
||||
body: JSON.stringify({ story: payload }),
|
||||
signal: AbortSignal.timeout(15_000),
|
||||
});
|
||||
if (!resp.ok) {
|
||||
|
||||
7
scripts/shared/brief-llm-core.d.ts
vendored
7
scripts/shared/brief-llm-core.d.ts
vendored
@@ -4,6 +4,9 @@ export interface BriefStoryHashInput {
|
||||
threatLevel?: string;
|
||||
category?: string;
|
||||
country?: string;
|
||||
/** v5: part of cache identity so same-story + different description
|
||||
* don't collide on cached analyst output. */
|
||||
description?: string;
|
||||
}
|
||||
|
||||
export interface BriefStoryPromptInput {
|
||||
@@ -24,3 +27,7 @@ export function buildWhyMattersUserPrompt(story: BriefStoryPromptInput): {
|
||||
export function parseWhyMatters(text: unknown): string | null;
|
||||
|
||||
export function hashBriefStory(story: BriefStoryHashInput): Promise<string>;
|
||||
|
||||
// ── v2 (analyst path only) ────────────────────────────────────────────────
|
||||
export const WHY_MATTERS_ANALYST_SYSTEM_V2: string;
|
||||
export function parseWhyMattersV2(text: unknown): string | null;
|
||||
|
||||
@@ -69,11 +69,20 @@ export function parseWhyMatters(text) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Deterministic 16-char hex hash of the five story fields that flow
|
||||
* into the whyMatters prompt. Same material as the pre-v3 sync
|
||||
* implementation (`scripts/lib/brief-llm.mjs:hashBriefStory`) — a
|
||||
* fixed fixture in tests/brief-llm-core.test.mjs pins the output so a
|
||||
* future refactor cannot silently invalidate every cached entry.
|
||||
* Deterministic 16-char hex hash of the SIX story fields that flow
|
||||
* into the whyMatters prompt (5 core + description). Cache identity
|
||||
* MUST cover every field that shapes the LLM output, or two requests
|
||||
* with the same core fields but different descriptions will share a
|
||||
* cache entry and the second caller gets prose grounded in the first
|
||||
* caller's description (P1 regression caught in PR #3269 review).
|
||||
*
|
||||
* History:
|
||||
* - pre-v3: 5 fields, sync `node:crypto.createHash`.
|
||||
* - v3: moved to Web Crypto (async), same 5 fields.
|
||||
* - v5 (with endpoint cache bump to brief:llm:whymatters:v5:):
|
||||
* 6 fields — `description` added to match the analyst path's
|
||||
* v2 prompt which interpolates `Description: <desc>` between
|
||||
* headline and source.
|
||||
*
|
||||
* Uses Web Crypto so the module is edge-safe. Returns a Promise because
|
||||
* `crypto.subtle.digest` is async; cron call sites are already in an
|
||||
@@ -85,6 +94,7 @@ export function parseWhyMatters(text) {
|
||||
* threatLevel?: string;
|
||||
* category?: string;
|
||||
* country?: string;
|
||||
* description?: string;
|
||||
* }} story
|
||||
* @returns {Promise<string>}
|
||||
*/
|
||||
@@ -95,6 +105,11 @@ export async function hashBriefStory(story) {
|
||||
story.threatLevel ?? '',
|
||||
story.category ?? '',
|
||||
story.country ?? '',
|
||||
// New in v5: description is a prompt input on the analyst path,
|
||||
// so MUST be part of cache identity. Absent on legacy paths →
|
||||
// empty string → deterministic; same-story-same-description pairs
|
||||
// still collide on purpose, different descriptions don't.
|
||||
story.description ?? '',
|
||||
].join('||');
|
||||
const bytes = new TextEncoder().encode(material);
|
||||
const digest = await crypto.subtle.digest('SHA-256', bytes);
|
||||
@@ -105,3 +120,69 @@ export async function hashBriefStory(story) {
|
||||
}
|
||||
return hex.slice(0, 16);
|
||||
}
|
||||
|
||||
// ── Analyst-path prompt v2 (multi-sentence, grounded) ──────────────────────
|
||||
//
|
||||
// Shadow-diff on 12 prod stories (2026-04-21) showed the v1 analyst output
|
||||
// was indistinguishable from the legacy Gemini-only output: identical
|
||||
// single-sentence abstraction-speak ("destabilize / systemic / sovereign
|
||||
// risk repricing") with no named actors, metrics, or dates. Root cause:
|
||||
// the 18–30 word cap compressed the context's specifics out of the LLM's
|
||||
// response. v2 loosens to 40–70 words across 2–3 sentences and REQUIRES
|
||||
// the LLM to ground at least one specific reference from the live context.
|
||||
|
||||
/**
|
||||
* System prompt for the analyst-path v2 (2–3 sentences, ~40–70 words,
|
||||
* grounded in a specific named actor / metric / date / place drawn
|
||||
* from the live context). Shape nudged toward the WMAnalyst chat voice
|
||||
* (SITUATION → ANALYSIS → optional WATCH) but rendered as plain prose,
|
||||
* no section labels in the output.
|
||||
*/
|
||||
export const WHY_MATTERS_ANALYST_SYSTEM_V2 =
|
||||
'You are the lead analyst at WorldMonitor Brief, a geopolitical intelligence magazine. ' +
|
||||
'Using the Live WorldMonitor Context AND the story, write 2–3 sentences (40–70 words total) ' +
|
||||
'on why the story matters.\n\n' +
|
||||
'STRUCTURE:\n' +
|
||||
'1. SITUATION — what is happening right now, grounded in a SPECIFIC named actor, ' +
|
||||
'metric, date, or place drawn from the context.\n' +
|
||||
'2. ANALYSIS — the structural consequence (why this forces a repricing, shifts ' +
|
||||
'the balance, triggers a cascade).\n' +
|
||||
'3. (Optional) WATCH — the threshold or indicator to track, if clear from the context.\n\n' +
|
||||
'HARD CONSTRAINTS:\n' +
|
||||
'- Total length 40–70 words across 2–3 sentences.\n' +
|
||||
'- MUST reference at least ONE specific: named person / country / organization / ' +
|
||||
'number / percentage / date / city — drawn from the context, NOT invented.\n' +
|
||||
'- No preamble ("This matters because…", "The importance of…").\n' +
|
||||
'- No markdown, no bullet points, no section labels in the output — plain prose.\n' +
|
||||
'- Editorial, impersonal, serious. No calls to action, no questions, no quotes.';
|
||||
|
||||
/**
|
||||
* Parse + validate the analyst-path v2 LLM response. Accepts
|
||||
* multi-sentence output (2–3 sentences), 100–500 chars. Otherwise
|
||||
* same rejection semantics as v1 (stub echo, empty) plus explicit
|
||||
* rejection of preamble boilerplate and leaked section labels.
|
||||
*
|
||||
* Returns null when the output is obviously wrong so the caller can
|
||||
* fall through to the next layer.
|
||||
*
|
||||
* @param {unknown} text
|
||||
* @returns {string | null}
|
||||
*/
|
||||
export function parseWhyMattersV2(text) {
|
||||
if (typeof text !== 'string') return null;
|
||||
let s = text.trim();
|
||||
if (!s) return null;
|
||||
// Drop surrounding quotes if the model insisted.
|
||||
s = s.replace(/^[\u201C"']+/, '').replace(/[\u201D"']+$/, '').trim();
|
||||
if (s.length < 100 || s.length > 500) return null;
|
||||
// Reject the stub echo (same as v1).
|
||||
if (/^story flagged by your sensitivity/i.test(s)) return null;
|
||||
// Reject common preamble the system prompt explicitly banned.
|
||||
if (/^(this matters because|the importance of|it is important|importantly,|in summary,|to summarize)/i.test(s)) {
|
||||
return null;
|
||||
}
|
||||
// Reject markdown / section-label leakage (we told it to use plain prose).
|
||||
if (/^(#|-|\*|\d+\.\s)/.test(s)) return null;
|
||||
if (/^(situation|analysis|watch)\s*[:\-–—]/i.test(s)) return null;
|
||||
return s;
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
* LLM latency predictable.
|
||||
*/
|
||||
|
||||
import { WHY_MATTERS_SYSTEM } from '../../../../shared/brief-llm-core.js';
|
||||
import { WHY_MATTERS_ANALYST_SYSTEM_V2 } from '../../../../shared/brief-llm-core.js';
|
||||
import { sanitizeForPrompt } from '../../../_shared/llm-sanitize.js';
|
||||
import type { BriefStoryContext } from './brief-story-context';
|
||||
|
||||
@@ -22,6 +22,9 @@ export interface StoryForPrompt {
|
||||
threatLevel: string;
|
||||
category: string;
|
||||
country: string;
|
||||
/** Optional story description; included when the cron has already
|
||||
* resolved it (post-describe pipeline). Absent on first-pass calls. */
|
||||
description?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -38,6 +41,9 @@ export function sanitizeStoryFields(story: StoryForPrompt): StoryForPrompt {
|
||||
threatLevel: sanitizeForPrompt(story.threatLevel),
|
||||
category: sanitizeForPrompt(story.category),
|
||||
country: sanitizeForPrompt(story.country),
|
||||
...(typeof story.description === 'string' && story.description.length > 0
|
||||
? { description: sanitizeForPrompt(story.description) }
|
||||
: {}),
|
||||
};
|
||||
}
|
||||
|
||||
@@ -103,23 +109,34 @@ export function buildAnalystWhyMattersPrompt(
|
||||
const safe = sanitizeStoryFields(story);
|
||||
const contextBlock = buildContextBlock(context);
|
||||
|
||||
const storyLines = [
|
||||
const storyLineList = [
|
||||
`Headline: ${safe.headline}`,
|
||||
...(safe.description ? [`Description: ${safe.description}`] : []),
|
||||
`Source: ${safe.source}`,
|
||||
`Severity: ${safe.threatLevel}`,
|
||||
`Category: ${safe.category}`,
|
||||
`Country: ${safe.country}`,
|
||||
].join('\n');
|
||||
];
|
||||
const storyLines = storyLineList.join('\n');
|
||||
|
||||
const sections = [];
|
||||
if (contextBlock) {
|
||||
sections.push('# Live WorldMonitor Context', contextBlock);
|
||||
}
|
||||
sections.push('# Story', storyLines);
|
||||
sections.push('One editorial sentence on why this matters:');
|
||||
// Prompt footer matches the system prompt's SITUATION → ANALYSIS →
|
||||
// (optional) WATCH arc, but explicitly restates the grounding
|
||||
// requirement so the model can't ignore it from the system message
|
||||
// alone. Models follow inline instructions more reliably than
|
||||
// system-prompt constraints on longer outputs.
|
||||
sections.push(
|
||||
'Write 2–3 sentences (40–70 words) on why this story matters, grounded in at ' +
|
||||
'least ONE specific actor / metric / date / place drawn from the context above. ' +
|
||||
'Plain prose, no section labels in the output:',
|
||||
);
|
||||
|
||||
return {
|
||||
system: WHY_MATTERS_SYSTEM,
|
||||
system: WHY_MATTERS_ANALYST_SYSTEM_V2,
|
||||
user: sections.join('\n\n'),
|
||||
};
|
||||
}
|
||||
|
||||
7
shared/brief-llm-core.d.ts
vendored
7
shared/brief-llm-core.d.ts
vendored
@@ -4,6 +4,9 @@ export interface BriefStoryHashInput {
|
||||
threatLevel?: string;
|
||||
category?: string;
|
||||
country?: string;
|
||||
/** v5: part of cache identity so same-story + different description
|
||||
* don't collide on cached analyst output. */
|
||||
description?: string;
|
||||
}
|
||||
|
||||
export interface BriefStoryPromptInput {
|
||||
@@ -24,3 +27,7 @@ export function buildWhyMattersUserPrompt(story: BriefStoryPromptInput): {
|
||||
export function parseWhyMatters(text: unknown): string | null;
|
||||
|
||||
export function hashBriefStory(story: BriefStoryHashInput): Promise<string>;
|
||||
|
||||
// ── v2 (analyst path only) ────────────────────────────────────────────────
|
||||
export const WHY_MATTERS_ANALYST_SYSTEM_V2: string;
|
||||
export function parseWhyMattersV2(text: unknown): string | null;
|
||||
|
||||
@@ -69,11 +69,20 @@ export function parseWhyMatters(text) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Deterministic 16-char hex hash of the five story fields that flow
|
||||
* into the whyMatters prompt. Same material as the pre-v3 sync
|
||||
* implementation (`scripts/lib/brief-llm.mjs:hashBriefStory`) — a
|
||||
* fixed fixture in tests/brief-llm-core.test.mjs pins the output so a
|
||||
* future refactor cannot silently invalidate every cached entry.
|
||||
* Deterministic 16-char hex hash of the SIX story fields that flow
|
||||
* into the whyMatters prompt (5 core + description). Cache identity
|
||||
* MUST cover every field that shapes the LLM output, or two requests
|
||||
* with the same core fields but different descriptions will share a
|
||||
* cache entry and the second caller gets prose grounded in the first
|
||||
* caller's description (P1 regression caught in PR #3269 review).
|
||||
*
|
||||
* History:
|
||||
* - pre-v3: 5 fields, sync `node:crypto.createHash`.
|
||||
* - v3: moved to Web Crypto (async), same 5 fields.
|
||||
* - v5 (with endpoint cache bump to brief:llm:whymatters:v5:):
|
||||
* 6 fields — `description` added to match the analyst path's
|
||||
* v2 prompt which interpolates `Description: <desc>` between
|
||||
* headline and source.
|
||||
*
|
||||
* Uses Web Crypto so the module is edge-safe. Returns a Promise because
|
||||
* `crypto.subtle.digest` is async; cron call sites are already in an
|
||||
@@ -85,6 +94,7 @@ export function parseWhyMatters(text) {
|
||||
* threatLevel?: string;
|
||||
* category?: string;
|
||||
* country?: string;
|
||||
* description?: string;
|
||||
* }} story
|
||||
* @returns {Promise<string>}
|
||||
*/
|
||||
@@ -95,6 +105,11 @@ export async function hashBriefStory(story) {
|
||||
story.threatLevel ?? '',
|
||||
story.category ?? '',
|
||||
story.country ?? '',
|
||||
// New in v5: description is a prompt input on the analyst path,
|
||||
// so MUST be part of cache identity. Absent on legacy paths →
|
||||
// empty string → deterministic; same-story-same-description pairs
|
||||
// still collide on purpose, different descriptions don't.
|
||||
story.description ?? '',
|
||||
].join('||');
|
||||
const bytes = new TextEncoder().encode(material);
|
||||
const digest = await crypto.subtle.digest('SHA-256', bytes);
|
||||
@@ -105,3 +120,69 @@ export async function hashBriefStory(story) {
|
||||
}
|
||||
return hex.slice(0, 16);
|
||||
}
|
||||
|
||||
// ── Analyst-path prompt v2 (multi-sentence, grounded) ──────────────────────
|
||||
//
|
||||
// Shadow-diff on 12 prod stories (2026-04-21) showed the v1 analyst output
|
||||
// was indistinguishable from the legacy Gemini-only output: identical
|
||||
// single-sentence abstraction-speak ("destabilize / systemic / sovereign
|
||||
// risk repricing") with no named actors, metrics, or dates. Root cause:
|
||||
// the 18–30 word cap compressed the context's specifics out of the LLM's
|
||||
// response. v2 loosens to 40–70 words across 2–3 sentences and REQUIRES
|
||||
// the LLM to ground at least one specific reference from the live context.
|
||||
|
||||
/**
|
||||
* System prompt for the analyst-path v2 (2–3 sentences, ~40–70 words,
|
||||
* grounded in a specific named actor / metric / date / place drawn
|
||||
* from the live context). Shape nudged toward the WMAnalyst chat voice
|
||||
* (SITUATION → ANALYSIS → optional WATCH) but rendered as plain prose,
|
||||
* no section labels in the output.
|
||||
*/
|
||||
export const WHY_MATTERS_ANALYST_SYSTEM_V2 =
|
||||
'You are the lead analyst at WorldMonitor Brief, a geopolitical intelligence magazine. ' +
|
||||
'Using the Live WorldMonitor Context AND the story, write 2–3 sentences (40–70 words total) ' +
|
||||
'on why the story matters.\n\n' +
|
||||
'STRUCTURE:\n' +
|
||||
'1. SITUATION — what is happening right now, grounded in a SPECIFIC named actor, ' +
|
||||
'metric, date, or place drawn from the context.\n' +
|
||||
'2. ANALYSIS — the structural consequence (why this forces a repricing, shifts ' +
|
||||
'the balance, triggers a cascade).\n' +
|
||||
'3. (Optional) WATCH — the threshold or indicator to track, if clear from the context.\n\n' +
|
||||
'HARD CONSTRAINTS:\n' +
|
||||
'- Total length 40–70 words across 2–3 sentences.\n' +
|
||||
'- MUST reference at least ONE specific: named person / country / organization / ' +
|
||||
'number / percentage / date / city — drawn from the context, NOT invented.\n' +
|
||||
'- No preamble ("This matters because…", "The importance of…").\n' +
|
||||
'- No markdown, no bullet points, no section labels in the output — plain prose.\n' +
|
||||
'- Editorial, impersonal, serious. No calls to action, no questions, no quotes.';
|
||||
|
||||
/**
|
||||
* Parse + validate the analyst-path v2 LLM response. Accepts
|
||||
* multi-sentence output (2–3 sentences), 100–500 chars. Otherwise
|
||||
* same rejection semantics as v1 (stub echo, empty) plus explicit
|
||||
* rejection of preamble boilerplate and leaked section labels.
|
||||
*
|
||||
* Returns null when the output is obviously wrong so the caller can
|
||||
* fall through to the next layer.
|
||||
*
|
||||
* @param {unknown} text
|
||||
* @returns {string | null}
|
||||
*/
|
||||
export function parseWhyMattersV2(text) {
|
||||
if (typeof text !== 'string') return null;
|
||||
let s = text.trim();
|
||||
if (!s) return null;
|
||||
// Drop surrounding quotes if the model insisted.
|
||||
s = s.replace(/^[\u201C"']+/, '').replace(/[\u201D"']+$/, '').trim();
|
||||
if (s.length < 100 || s.length > 500) return null;
|
||||
// Reject the stub echo (same as v1).
|
||||
if (/^story flagged by your sensitivity/i.test(s)) return null;
|
||||
// Reject common preamble the system prompt explicitly banned.
|
||||
if (/^(this matters because|the importance of|it is important|importantly,|in summary,|to summarize)/i.test(s)) {
|
||||
return null;
|
||||
}
|
||||
// Reject markdown / section-label leakage (we told it to use plain prose).
|
||||
if (/^(#|-|\*|\d+\.\s)/.test(s)) return null;
|
||||
if (/^(situation|analysis|watch)\s*[:\-–—]/i.test(s)) return null;
|
||||
return s;
|
||||
}
|
||||
|
||||
@@ -23,8 +23,9 @@ import {
|
||||
parseWhyMatters,
|
||||
} from '../shared/brief-llm-core.js';
|
||||
|
||||
// Pre-extract sync impl, kept inline so the parity test can't drift from
|
||||
// what the cron used to emit.
|
||||
// Mirror impl (sync `node:crypto`) — kept inline so a drift between
|
||||
// the Web Crypto implementation and this sentinel fails the parity
|
||||
// test here first. Must include `description` to match v5 semantics.
|
||||
function legacyHashBriefStory(story) {
|
||||
const material = [
|
||||
story.headline ?? '',
|
||||
@@ -32,6 +33,7 @@ function legacyHashBriefStory(story) {
|
||||
story.threatLevel ?? '',
|
||||
story.category ?? '',
|
||||
story.country ?? '',
|
||||
story.description ?? '',
|
||||
].join('||');
|
||||
return createHash('sha256').update(material).digest('hex').slice(0, 16);
|
||||
}
|
||||
@@ -74,6 +76,30 @@ describe('hashBriefStory — Web Crypto parity with legacy node:crypto', () => {
|
||||
}
|
||||
});
|
||||
|
||||
it('description is part of cache identity (v5 regression guard)', async () => {
|
||||
// Pinned from PR #3269 review P1: adding `description` to the
|
||||
// analyst prompt without adding it to the hash caused same-story-
|
||||
// diff-description to collide on one cache entry, so callers got
|
||||
// prose grounded in a PREVIOUS caller's description.
|
||||
const withDescA = {
|
||||
...FIXTURE,
|
||||
description: 'Tehran publicly reopened commercial shipping.',
|
||||
};
|
||||
const withDescB = {
|
||||
...FIXTURE,
|
||||
description: 'Iran formally blockaded outbound tankers.',
|
||||
};
|
||||
const noDesc = { ...FIXTURE };
|
||||
|
||||
const hashA = await hashBriefStory(withDescA);
|
||||
const hashB = await hashBriefStory(withDescB);
|
||||
const hashNone = await hashBriefStory(noDesc);
|
||||
|
||||
assert.notEqual(hashA, hashB, 'different descriptions must produce different hashes');
|
||||
assert.notEqual(hashA, hashNone, 'description present vs absent must differ');
|
||||
assert.notEqual(hashB, hashNone);
|
||||
});
|
||||
|
||||
it('treats missing fields as empty strings (backcompat)', async () => {
|
||||
const partial = { headline: FIXTURE.headline };
|
||||
const expected = legacyHashBriefStory(partial);
|
||||
@@ -140,3 +166,78 @@ describe('parseWhyMatters — pure sentence validator', () => {
|
||||
assert.equal(parseWhyMatters(s), s);
|
||||
});
|
||||
});
|
||||
|
||||
describe('parseWhyMattersV2 — multi-sentence, analyst-path only', () => {
|
||||
it('lazy-loads', async () => {
|
||||
const mod = await import('../shared/brief-llm-core.js');
|
||||
assert.equal(typeof mod.parseWhyMattersV2, 'function');
|
||||
});
|
||||
|
||||
it('accepts 2–3 sentences totalling 100–500 chars', async () => {
|
||||
const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
|
||||
const good =
|
||||
"Iran's closure of the Strait of Hormuz on April 21 halts roughly 20% of global seaborne oil. " +
|
||||
'The disruption forces an immediate repricing of sovereign risk across Gulf energy exporters. ' +
|
||||
'Watch IMF commentary in the next 48 hours for cascading guidance.';
|
||||
assert.ok(good.length >= 100 && good.length <= 500);
|
||||
assert.equal(parseWhyMattersV2(good), good);
|
||||
});
|
||||
|
||||
it('rejects <100 chars (too terse for the analyst contract)', async () => {
|
||||
const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
|
||||
assert.equal(parseWhyMattersV2('Short.'), null);
|
||||
assert.equal(parseWhyMattersV2('x'.repeat(99)), null);
|
||||
});
|
||||
|
||||
// Upper bound: a string one character past the 500-char ceiling is rejected.
it('rejects >500 chars (runaway generation)', async () => {
  const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
  const overlong = 'a'.repeat(501);
  const verdict = parseWhyMattersV2(overlong);
  assert.equal(verdict, null);
});
|
||||
|
||||
// Each fixture opens with a boilerplate preamble and must be rejected (null).
// Every fixture is first checked to be inside the 100–500 char window so the
// rejection cannot be attributed to the length gate.
it('rejects preamble the system prompt banned', async () => {
  const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
  const bannedOpeners = [
    'This matters because global energy markets depend on the Strait of Hormuz remaining open for transit and this is therefore a critical development.',
    'The importance of this development cannot be overstated given the potential for cascading economic impacts across multiple regions and industries.',
    'It is important to note that the ongoing situation in the Strait of Hormuz has implications that extend far beyond simple maritime concerns.',
    'Importantly, the developments in the Strait of Hormuz today signal a shift in regional dynamics that could reshape global energy markets for months.',
    'In summary, the current situation presents significant risks to global stability and requires careful monitoring of diplomatic and military channels.',
    'To summarize the situation, the Strait of Hormuz developments represent a critical juncture in regional power dynamics with broad implications.',
  ];
  bannedOpeners.forEach((text) => {
    const sizeOk = text.length >= 100 && text.length <= 500;
    assert.ok(sizeOk);
    const result = parseWhyMattersV2(text);
    assert.equal(result, null, `should reject preamble: ${text.slice(0, 40)}…`);
  });
});
|
||||
|
||||
// Fixtures that open with markdown (heading, bullets, numbered list) or a
// leaked section label must be rejected (null) by the v2 parser.
it('rejects markdown / leaked section labels the prompt told it to omit', async () => {
  const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
  const cases = [
    '# Situation\nIran closed the strait on April 21, halting 20% of seaborne oil. Analysis: sovereign risk repricing follows immediately for Gulf exporters.',
    '- Bullet one that should not open the response at all given the plain-prose rule in the system message.\n- Bullet two of the banned response.',
    '* Leading bullet with asterisk that should also trip the markdown rejection because analyst prose should be plain paragraphs across 2–3 sentences.',
    '1. Numbered point opening the response is equally banned by the system prompt requiring plain prose across two to three sentences with grounded references.',
    'SITUATION: Iran closed Hormuz today. ANALYSIS: cascading sovereign repricing follows. WATCH: IMF Gulf commentary in 48h. This mirrors the 2019 pattern.',
    'Analysis — the Strait closure triggers a cascading sovereign risk repricing across Gulf exporters with immediate effect on global markets and shipping lanes.',
  ];
  for (const c of cases) {
    // Same guard the preamble test uses: keep every fixture inside the
    // 100–500 char window so a null result cannot come from the length gate
    // alone, which would let this test pass without exercising the
    // markdown / label rejection at all.
    assert.ok(
      c.length >= 100 && c.length <= 500,
      `fixture must sit inside the 100–500 char window: ${c.slice(0, 40)}…`,
    );
    assert.equal(parseWhyMattersV2(c), null, `should reject leaked label: ${c.slice(0, 40)}…`);
  }
});
|
||||
|
||||
// The stub text must be rejected (null) even when padded past the minimum
// length.
it('still rejects the stub echo', async () => {
  const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
  const stub =
    'Story flagged by your sensitivity settings. Open for context. This stub is long enough to clear the 100-char floor but must still be rejected as non-enrichment output.';
  // The fixture claims to clear the 100-char floor; verify that claim so the
  // null below is attributable to stub detection rather than the length gate.
  assert.ok(
    stub.length >= 100 && stub.length <= 500,
    'stub fixture must sit inside the 100–500 char window',
  );
  assert.equal(parseWhyMattersV2(stub), null);
});
|
||||
|
||||
// Input wrapped in curly double quotes (U+201C … U+201D): the parser must
// return a non-empty result that neither starts nor ends with those quotes.
it('strips surrounding smart-quotes before validation', async () => {
  const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
  const quoted =
    '\u201CIran closed the Strait on April 21, halting 20% of seaborne oil. The disruption forces an immediate repricing of sovereign risk across Gulf exporters.\u201D';
  const cleaned = parseWhyMattersV2(quoted);
  // `cleaned &&` short-circuits so a null result fails the assertion
  // instead of throwing on a method call.
  const noLeadingQuote = cleaned && !cleaned.startsWith('\u201C');
  assert.ok(noLeadingQuote);
  const noTrailingQuote = cleaned && !cleaned.endsWith('\u201D');
  assert.ok(noTrailingQuote);
});
|
||||
});
|
||||
|
||||
@@ -204,10 +204,9 @@ describe('generateWhyMatters — analyst priority', () => {
|
||||
assert.equal(callLlmInvoked, true, 'legacy callLLM must fire after analyst miss');
|
||||
});
|
||||
|
||||
it('falls through when analyst returns unparseable prose (parser rejection)', async () => {
|
||||
it('falls through when analyst returns out-of-bounds output (too short)', async () => {
|
||||
let callLlmInvoked = false;
|
||||
const out = await generateWhyMatters(story(), {
|
||||
// Too short — fails parseWhyMatters length gate (< 30 chars).
|
||||
callAnalystWhyMatters: async () => 'Short.',
|
||||
callLLM: async () => {
|
||||
callLlmInvoked = true;
|
||||
@@ -217,7 +216,33 @@ describe('generateWhyMatters — analyst priority', () => {
|
||||
cacheSet: async () => {},
|
||||
});
|
||||
assert.equal(out, VALID);
|
||||
assert.equal(callLlmInvoked, true, 'unparseable analyst output must trigger fallback');
|
||||
assert.equal(callLlmInvoked, true, 'out-of-bounds analyst output must trigger fallback');
|
||||
});
|
||||
|
||||
// P1 regression guard (PR #3269 review): the endpoint returns 2–3 sentences
// already validated by parseWhyMattersV2. Re-parsing that string with the v1
// single-sentence parser would silently drop sentences 2 and 3, so the
// analyst output must come back from generateWhyMatters exactly as supplied,
// with only obvious garbage (length / stub echo) rejected at this layer.
it('preserves multi-sentence v2 analyst output verbatim (P1 regression guard)', async () => {
  const multi = [
    "Iran's closure of the Strait of Hormuz on April 21 halts roughly 20% of global seaborne oil. ",
    'The disruption forces an immediate repricing of sovereign risk across Gulf energy exporters. ',
    'Watch IMF commentary in the next 48 hours for cascading guidance.',
  ].join('');
  let legacyFired = false;
  const deps = {
    callAnalystWhyMatters: async () => multi,
    // Legacy path: records that it ran so the test can assert it did not.
    callLLM: async () => {
      legacyFired = true;
      return VALID;
    },
    cacheGet: async () => null,
    cacheSet: async () => {},
  };
  const out = await generateWhyMatters(story(), deps);
  assert.equal(out, multi, 'multi-sentence v2 output must reach the envelope unchanged');
  assert.equal(legacyFired, false, 'legacy callLLM must not fire when v2 analyst succeeds');
  // Sanity: more than one sentence survived (nothing was cut to the first).
  const pieces = out.split('. ');
  assert.ok(pieces.length >= 2, 'output must retain 2nd+ sentences');
});
|
||||
|
||||
it('falls through when analyst throws', async () => {
|
||||
@@ -359,7 +384,8 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
|
||||
assert.ok(typeof builder === 'function');
|
||||
});
|
||||
|
||||
it('reuses WHY_MATTERS_SYSTEM verbatim', () => {
|
||||
it('uses the analyst v2 system prompt (multi-sentence, grounded)', async () => {
|
||||
const { WHY_MATTERS_ANALYST_SYSTEM_V2 } = await import('../shared/brief-llm-core.js');
|
||||
const { system } = builder(story(), {
|
||||
worldBrief: 'X',
|
||||
countryBrief: '',
|
||||
@@ -369,10 +395,13 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
|
||||
macroSignals: '',
|
||||
degraded: false,
|
||||
});
|
||||
assert.equal(system, WHY_MATTERS_SYSTEM);
|
||||
assert.equal(system, WHY_MATTERS_ANALYST_SYSTEM_V2);
|
||||
// Contract must still mention the 40–70 word target + grounding rule.
|
||||
assert.match(system, /40–70 words/);
|
||||
assert.match(system, /named person \/ country \/ organization \/ number \/ percentage \/ date \/ city/);
|
||||
});
|
||||
|
||||
it('includes the story fields in the same 5-line format', () => {
|
||||
it('includes story fields with the multi-sentence footer', () => {
|
||||
const { user } = builder(story(), {
|
||||
worldBrief: '',
|
||||
countryBrief: '',
|
||||
@@ -387,7 +416,38 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
|
||||
assert.match(user, /Severity: critical/);
|
||||
assert.match(user, /Category: Geopolitical Risk/);
|
||||
assert.match(user, /Country: IR/);
|
||||
assert.match(user, /One editorial sentence on why this matters:$/);
|
||||
assert.match(user, /Write 2–3 sentences \(40–70 words\)/);
|
||||
assert.match(user, /grounded in at least ONE specific/);
|
||||
});
|
||||
|
||||
// When the story carries a description, the user prompt must contain a
// "Description: …" line with that text.
it('includes story description when present', () => {
  const emptyContext = {
    worldBrief: '',
    countryBrief: '',
    riskScores: '',
    forecasts: '',
    marketData: '',
    macroSignals: '',
    degraded: false,
  };
  const storyWithDesc = Object.assign({}, story(), {
    description: 'Tehran publicly reopened the Strait of Hormuz to commercial shipping today.',
  });
  const prompt = builder(storyWithDesc, emptyContext);
  assert.match(prompt.user, /Description: Tehran publicly reopened/);
});
|
||||
|
||||
// Without a description on the story, the user prompt must not contain any
// "Description:" line.
it('omits description line when field absent', () => {
  const emptyContext = {
    worldBrief: '',
    countryBrief: '',
    riskScores: '',
    forecasts: '',
    marketData: '',
    macroSignals: '',
    degraded: false,
  };
  const prompt = builder(story(), emptyContext);
  assert.doesNotMatch(prompt.user, /Description:/);
});
|
||||
|
||||
it('omits context block when all fields empty', () => {
|
||||
|
||||
Reference in New Issue
Block a user