feat(brief): analyst prompt v2 — multi-sentence, grounded, story description (#3269)

* feat(brief): analyst prompt v2 — multi-sentence, grounded, includes story description Shadow-diff of 12 prod stories on 2026-04-21 showed v1 analyst output indistinguishable from legacy Gemini: identical single-sentence abstraction ("destabilize / systemic / sovereign risk repricing") with no named actors, metrics, or dates — in several cases Gemini was MORE specific. Root cause: 18–30 word cap compressed context specifics out. v2 loosens three dials at once so we can settle the A/B: 1. New system prompt WHY_MATTERS_ANALYST_SYSTEM_V2 — 2–3 sentences, 40–70 words, implicit SITUATION→ANALYSIS→(optional) WATCH arc, MUST cite one specific named actor / metric / date / place from the context. Analyst path only; gemini path stays on v1. 2. New parser parseWhyMattersV2 — accepts 100–500 chars, rejects preamble boilerplate + leaked section labels + markdown. 3. Story description plumbed through — endpoint body accepts optional story.description (≤ 1000 chars, body cap bumped 4 KB → 8 KB). Cron forwards it when upstream has one (skipped when it equals the headline — no new signal). Cache + shadow bumped v3 → v4 / v1 → v2 so fresh output lands on the first post-deploy cron tick. maxTokens 180 → 260 for ~3× output length. If shadow-diff 24h after deploy still shows no delta vs gemini, the kill switch is BRIEF_WHY_MATTERS_PRIMARY=gemini on Vercel (instant, no redeploy). Tests: 6059 pass (was 6022 + 37 new). typecheck × 2 clean. * fix(brief): stop truncating v2 multi-sentence output + description in cache hash Two P1s caught in PR #3269 review. P1a — cron reparsed endpoint output with v1 single-sentence parser, silently dropping sentences 2+3 of v2 analyst output. The endpoint had ALREADY validated the string (parseWhyMattersV2 for analyst path; parseWhyMatters for gemini). Re-parsing with v1 took only the first sentence — the exact regression #3269 was meant to fix. Fix: trust the endpoint. Replace re-parse with bounds check (30–500 chars) + stub-echo reject. 
Added regression test asserting multi-sentence output reaches the envelope unchanged. P1b — `story.description` flowed into the analyst prompt but NOT into the cache hash. Two requests with identical core fields but different descriptions collided on one cache slot → second caller got prose grounded in the FIRST caller's description. Fix: add `description` as the 6th field of `hashBriefStory`. Bump endpoint cache v4→v5 and shadow v2→v3 so buggy 5-field entries are dropped. Updated the parity sentinel in brief-llm-core.test.mjs to match 6-field semantics. Added regression tests covering different-descriptions-differ and present-vs-absent-differ. Tests: 6083 pass. typecheck × 2 clean.
2026-04-25 17:14:57 +02:00 · 2026-04-21 22:25:54 +04:00
parent 048bb8bb52
commit ec35cf4158
10 changed files with 454 additions and 40 deletions
--- a/api/internal/brief-why-matters.ts
+++ b/api/internal/brief-why-matters.ts
@@ -50,6 +50,7 @@ import {
  buildWhyMattersUserPrompt,
  hashBriefStory,
  parseWhyMatters,
+  parseWhyMattersV2,
 } from '../../shared/brief-llm-core.js';

 // ── Env knobs (read at request entry so Railway/Vercel flips take effect
@@ -109,12 +110,16 @@ const SHADOW_TTL_SEC = 7 * 24 * 60 * 60; // 7d

 // ── Validation ────────────────────────────────────────────────────────
 const VALID_THREAT_LEVELS = new Set(['critical', 'high', 'medium', 'low']);
-const MAX_BODY_BYTES = 4096;
+// Bumped body cap to 8 KB: v2 optionally carries `story.description`
+// (up to 1000 chars) in addition to the other fields, which can push
+// worst-case payloads past the old 4 KB cap under UTF-8 expansion.
+const MAX_BODY_BYTES = 8192;
 const CAPS = {
  headline: 400,
  source: 120,
  category: 80,
  country: 80,
+  description: 1000,
 };

 interface StoryPayload {
@@ -123,6 +128,8 @@ interface StoryPayload {
  threatLevel: string;
  category: string;
  country: string;
+  /** Optional — gives the LLM a sentence of story context beyond the headline. */
+  description?: string;
 }

 type ValidationOk = { ok: true; story: StoryPayload };
@@ -177,6 +184,19 @@ function validateStoryBody(raw: unknown): ValidationOk | ValidationErr {
    country = s.country;
  }

+  // description — optional; when present, flows into the analyst prompt
+  // so the LLM has grounded story context beyond the headline.
+  let description: string | undefined;
+  if (s.description !== undefined && s.description !== null) {
+    if (typeof s.description !== 'string') {
+      return { ok: false, status: 400, error: 'story.description must be a string' };
+    }
+    if (s.description.length > CAPS.description) {
+      return { ok: false, status: 400, error: `story.description exceeds ${CAPS.description} chars` };
+    }
+    if (s.description.length > 0) description = s.description;
+  }
+
  return {
    ok: true,
    story: {
@@ -185,6 +205,7 @@ function validateStoryBody(raw: unknown): ValidationOk | ValidationErr {
      threatLevel: s.threatLevel,
      category: s.category as string,
      country,
+      ...(description ? { description } : {}),
    },
  };
 }
@@ -200,20 +221,23 @@ async function runAnalystPath(story: StoryPayload, iso2: string | null): Promise
        { role: 'system', content: system },
        { role: 'user', content: user },
      ],
-      maxTokens: 180,
+      // v2 prompt is 2–3 sentences / 40–70 words — roughly 3× v1's
+      // single-sentence output, so raise the maxTokens ceiling (180 → 260).
+      maxTokens: 260,
      temperature: 0.4,
      timeoutMs: 15_000,
      // Provider is pinned via LLM_REASONING_PROVIDER env var (already
      // set to 'openrouter' in prod). `callLlmReasoning` routes through
      // the resolveProviderChain based on that env.
-      // Note: no `validate` option. The post-call parseWhyMatters check
-      // below handles rejection by returning null. Using validate inside
+      // Note: no `validate` option. The post-call parseWhyMattersV2
+      // check below handles rejection. Using validate inside
      // callLlmReasoning would walk the provider chain on parse-reject,
-      // causing duplicate openrouter billings when only one provider is
-      // configured in prod. See todo 245.
+      // causing duplicate openrouter billings (see todo 245).
    });
    if (!result) return null;
-    return parseWhyMatters(result.content);
+    // v2 parser accepts multi-sentence output + rejects preamble /
+    // leaked section labels. Analyst path ONLY — gemini path stays on v1.
+    return parseWhyMattersV2(result.content);
  } catch (err) {
    console.warn(`[brief-why-matters] analyst path failed: ${err instanceof Error ? err.message : String(err)}`);
    return null;
@@ -338,8 +362,16 @@ export default async function handler(req: Request, ctx?: EdgeContext): Promise<

  // Cache identity.
  const hash = await hashBriefStory(story);
-  const cacheKey = `brief:llm:whymatters:v3:${hash}`;
-  const shadowKey = `brief:llm:whymatters:shadow:v1:${hash}`;
+  // v5: `hashBriefStory` now includes `description` as a prompt input
+  // so same-story + different description no longer collide on a single
+  // cache entry (P1 caught in PR #3269 review — endpoint could serve
+  // prose grounded in a PREVIOUS caller's description). Bumping v4→v5
+  // invalidates the short-lived v4 entries written under the buggy
+  // 5-field hash so fresh output lands on the next cron tick.
+  const cacheKey = `brief:llm:whymatters:v5:${hash}`;
+  // Shadow v2→v3 for the same reason — any v2 comparison pairs may be
+  // grounded in the wrong description, so the A/B was noisy.
+  const shadowKey = `brief:llm:whymatters:shadow:v3:${hash}`;

  // Cache read. Any infrastructure failure → treat as miss (logged).
  let cached: WhyMattersEnvelope | null = null;
--- a/scripts/lib/brief-llm.mjs
+++ b/scripts/lib/brief-llm.mjs
@@ -103,15 +103,24 @@ const BRIEF_LLM_SKIP_PROVIDERS = ['ollama', 'groq'];
 * }} deps
 */
 export async function generateWhyMatters(story, deps) {
-  // Priority path: analyst endpoint. It owns its own cache (v3) so
-  // the cron doesn't touch Redis when the endpoint handles the story.
+  // Priority path: analyst endpoint. It owns its own cache and has
+  // ALREADY validated the output via parseWhyMatters (gemini path) or
+  // parseWhyMattersV2 (analyst path, multi-sentence). We must NOT
+  // re-parse here with the single-sentence v1 parser — that silently
+  // truncates v2's 2–3-sentence output to the first sentence. Trust
+  // the wire shape; only reject an obviously-bad payload (empty, stub
+  // echo, or length outside the legal bounds for either parser).
  if (typeof deps.callAnalystWhyMatters === 'function') {
    try {
      const analystOut = await deps.callAnalystWhyMatters(story);
-      if (typeof analystOut === 'string' && analystOut.length > 0) {
-        const parsed = parseWhyMatters(analystOut);
-        if (parsed) return parsed;
-        console.warn('[brief-llm] callAnalystWhyMatters → fallback: analyst returned unparseable prose');
+      if (typeof analystOut === 'string') {
+        const trimmed = analystOut.trim();
+        const lenOk = trimmed.length >= 30 && trimmed.length <= 500;
+        const notStub = !/^story flagged by your sensitivity/i.test(trimmed);
+        if (lenOk && notStub) return trimmed;
+        console.warn(
+          `[brief-llm] callAnalystWhyMatters → fallback: endpoint returned out-of-bounds or stub (len=${trimmed.length})`,
+        );
      } else {
        console.warn('[brief-llm] callAnalystWhyMatters → fallback: null/empty response');
      }
--- a/scripts/seed-digest-notifications.mjs
+++ b/scripts/seed-digest-notifications.mjs
@@ -150,6 +150,25 @@ const BRIEF_WHY_MATTERS_ENDPOINT_URL =
 */
 async function callAnalystWhyMatters(story) {
  if (!RELAY_SECRET) return null;
+  // Forward a trimmed story payload so the endpoint only sees the
+  // fields it validates. `description` is NEW for prompt-v2 — when
+  // upstream has a real one (falls back to headline via
+  // shared/brief-filter.js:134), it gives the LLM a grounded sentence
+  // beyond the headline. Skip when it equals the headline (no signal).
+  const payload = {
+    headline: story.headline ?? '',
+    source: story.source ?? '',
+    threatLevel: story.threatLevel ?? '',
+    category: story.category ?? '',
+    country: story.country ?? '',
+  };
+  if (
+    typeof story.description === 'string' &&
+    story.description.length > 0 &&
+    story.description !== story.headline
+  ) {
+    payload.description = story.description;
+  }
  try {
    const resp = await fetch(BRIEF_WHY_MATTERS_ENDPOINT_URL, {
      method: 'POST',
@@ -164,7 +183,7 @@ async function callAnalystWhyMatters(story) {
        'User-Agent': 'worldmonitor-digest-notifications/1.0',
        Accept: 'application/json',
      },
-      body: JSON.stringify({ story }),
+      body: JSON.stringify({ story: payload }),
      signal: AbortSignal.timeout(15_000),
    });
    if (!resp.ok) {
--- a/scripts/shared/brief-llm-core.d.ts
+++ b/scripts/shared/brief-llm-core.d.ts
@@ -4,6 +4,9 @@ export interface BriefStoryHashInput {
  threatLevel?: string;
  category?: string;
  country?: string;
+  /** v5: part of cache identity so same-story + different description
+   *  don't collide on cached analyst output. */
+  description?: string;
 }

 export interface BriefStoryPromptInput {
@@ -24,3 +27,7 @@ export function buildWhyMattersUserPrompt(story: BriefStoryPromptInput): {
 export function parseWhyMatters(text: unknown): string | null;

 export function hashBriefStory(story: BriefStoryHashInput): Promise<string>;
+
+// ── v2 (analyst path only) ────────────────────────────────────────────────
+export const WHY_MATTERS_ANALYST_SYSTEM_V2: string;
+export function parseWhyMattersV2(text: unknown): string | null;
--- a/scripts/shared/brief-llm-core.js
+++ b/scripts/shared/brief-llm-core.js
@@ -69,11 +69,20 @@ export function parseWhyMatters(text) {
 }

 /**
- * Deterministic 16-char hex hash of the five story fields that flow
- * into the whyMatters prompt. Same material as the pre-v3 sync
- * implementation (`scripts/lib/brief-llm.mjs:hashBriefStory`) — a
- * fixed fixture in tests/brief-llm-core.test.mjs pins the output so a
- * future refactor cannot silently invalidate every cached entry.
+ * Deterministic 16-char hex hash of the SIX story fields that flow
+ * into the whyMatters prompt (5 core + description). Cache identity
+ * MUST cover every field that shapes the LLM output, or two requests
+ * with the same core fields but different descriptions will share a
+ * cache entry and the second caller gets prose grounded in the first
+ * caller's description (P1 regression caught in PR #3269 review).
+ *
+ * History:
+ *   - pre-v3: 5 fields, sync `node:crypto.createHash`.
+ *   - v3: moved to Web Crypto (async), same 5 fields.
+ *   - v5 (with endpoint cache bump to brief:llm:whymatters:v5:):
+ *     6 fields — `description` added to match the analyst path's
+ *     v2 prompt which interpolates `Description: <desc>` between
+ *     headline and source.
 *
 * Uses Web Crypto so the module is edge-safe. Returns a Promise because
 * `crypto.subtle.digest` is async; cron call sites are already in an
@@ -85,6 +94,7 @@ export function parseWhyMatters(text) {
 *   threatLevel?: string;
 *   category?: string;
 *   country?: string;
+ *   description?: string;
 * }} story
 * @returns {Promise<string>}
 */
@@ -95,6 +105,11 @@ export async function hashBriefStory(story) {
    story.threatLevel ?? '',
    story.category ?? '',
    story.country ?? '',
+    // New in v5: description is a prompt input on the analyst path,
+    // so MUST be part of cache identity. Absent on legacy paths →
+    // empty string → deterministic; same-story-same-description pairs
+    // still collide on purpose, different descriptions don't.
+    story.description ?? '',
  ].join('||');
  const bytes = new TextEncoder().encode(material);
  const digest = await crypto.subtle.digest('SHA-256', bytes);
@@ -105,3 +120,69 @@ export async function hashBriefStory(story) {
  }
  return hex.slice(0, 16);
 }
+
+// ── Analyst-path prompt v2 (multi-sentence, grounded) ──────────────────────
+//
+// Shadow-diff on 12 prod stories (2026-04-21) showed the v1 analyst output
+// was indistinguishable from the legacy Gemini-only output: identical
+// single-sentence abstraction-speak ("destabilize / systemic / sovereign
+// risk repricing") with no named actors, metrics, or dates. Root cause:
+// the 18–30 word cap compressed the context's specifics out of the LLM's
+// response. v2 loosens to 40–70 words across 2–3 sentences and REQUIRES
+// the LLM to ground at least one specific reference from the live context.
+
+/**
+ * System prompt for the analyst-path v2 (2–3 sentences, ~40–70 words,
+ * grounded in a specific named actor / metric / date / place drawn
+ * from the live context). Shape nudged toward the WMAnalyst chat voice
+ * (SITUATION → ANALYSIS → optional WATCH) but rendered as plain prose,
+ * no section labels in the output.
+ */
+export const WHY_MATTERS_ANALYST_SYSTEM_V2 =
+  'You are the lead analyst at WorldMonitor Brief, a geopolitical intelligence magazine. ' +
+  'Using the Live WorldMonitor Context AND the story, write 2–3 sentences (40–70 words total) ' +
+  'on why the story matters.\n\n' +
+  'STRUCTURE:\n' +
+  '1. SITUATION — what is happening right now, grounded in a SPECIFIC named actor, ' +
+  'metric, date, or place drawn from the context.\n' +
+  '2. ANALYSIS — the structural consequence (why this forces a repricing, shifts ' +
+  'the balance, triggers a cascade).\n' +
+  '3. (Optional) WATCH — the threshold or indicator to track, if clear from the context.\n\n' +
+  'HARD CONSTRAINTS:\n' +
+  '- Total length 40–70 words across 2–3 sentences.\n' +
+  '- MUST reference at least ONE specific: named person / country / organization / ' +
+  'number / percentage / date / city — drawn from the context, NOT invented.\n' +
+  '- No preamble ("This matters because…", "The importance of…").\n' +
+  '- No markdown, no bullet points, no section labels in the output — plain prose.\n' +
+  '- Editorial, impersonal, serious. No calls to action, no questions, no quotes.';
+
+/**
+ * Parse + validate the analyst-path v2 LLM response. Accepts
+ * multi-sentence output (2–3 sentences), 100–500 chars. Otherwise
+ * same rejection semantics as v1 (stub echo, empty) plus explicit
+ * rejection of preamble boilerplate and leaked section labels.
+ *
+ * Returns null when the output is obviously wrong so the caller can
+ * fall through to the next layer.
+ *
+ * @param {unknown} text
+ * @returns {string | null}
+ */
+export function parseWhyMattersV2(text) {
+  if (typeof text !== 'string') return null;
+  let s = text.trim();
+  if (!s) return null;
+  // Drop surrounding quotes if the model insisted.
+  s = s.replace(/^[\u201C"']+/, '').replace(/[\u201D"']+$/, '').trim();
+  if (s.length < 100 || s.length > 500) return null;
+  // Reject the stub echo (same as v1).
+  if (/^story flagged by your sensitivity/i.test(s)) return null;
+  // Reject common preamble the system prompt explicitly banned.
+  if (/^(this matters because|the importance of|it is important|importantly,|in summary,|to summarize)/i.test(s)) {
+    return null;
+  }
+  // Reject markdown / section-label leakage (we told it to use plain prose).
+  if (/^(#|-|\*|\d+\.\s)/.test(s)) return null;
+  if (/^(situation|analysis|watch)\s*[:\-–—]/i.test(s)) return null;
+  return s;
+}
--- a/server/worldmonitor/intelligence/v1/brief-why-matters-prompt.ts
+++ b/server/worldmonitor/intelligence/v1/brief-why-matters-prompt.ts
@@ -12,7 +12,7 @@
 * LLM latency predictable.
 */

-import { WHY_MATTERS_SYSTEM } from '../../../../shared/brief-llm-core.js';
+import { WHY_MATTERS_ANALYST_SYSTEM_V2 } from '../../../../shared/brief-llm-core.js';
 import { sanitizeForPrompt } from '../../../_shared/llm-sanitize.js';
 import type { BriefStoryContext } from './brief-story-context';

@@ -22,6 +22,9 @@ export interface StoryForPrompt {
  threatLevel: string;
  category: string;
  country: string;
+  /** Optional story description; included when the cron has already
+   *  resolved it (post-describe pipeline). Absent on first-pass calls. */
+  description?: string;
 }

 /**
@@ -38,6 +41,9 @@ export function sanitizeStoryFields(story: StoryForPrompt): StoryForPrompt {
    threatLevel: sanitizeForPrompt(story.threatLevel),
    category: sanitizeForPrompt(story.category),
    country: sanitizeForPrompt(story.country),
+    ...(typeof story.description === 'string' && story.description.length > 0
+      ? { description: sanitizeForPrompt(story.description) }
+      : {}),
  };
 }

@@ -103,23 +109,34 @@ export function buildAnalystWhyMattersPrompt(
  const safe = sanitizeStoryFields(story);
  const contextBlock = buildContextBlock(context);

-  const storyLines = [
+  const storyLineList = [
    `Headline: ${safe.headline}`,
+    ...(safe.description ? [`Description: ${safe.description}`] : []),
    `Source: ${safe.source}`,
    `Severity: ${safe.threatLevel}`,
    `Category: ${safe.category}`,
    `Country: ${safe.country}`,
-  ].join('\n');
+  ];
+  const storyLines = storyLineList.join('\n');

  const sections = [];
  if (contextBlock) {
    sections.push('# Live WorldMonitor Context', contextBlock);
  }
  sections.push('# Story', storyLines);
-  sections.push('One editorial sentence on why this matters:');
+  // Prompt footer matches the system prompt's SITUATION → ANALYSIS →
+  // (optional) WATCH arc, but explicitly restates the grounding
+  // requirement so the model can't ignore it from the system message
+  // alone. Models follow inline instructions more reliably than
+  // system-prompt constraints on longer outputs.
+  sections.push(
+    'Write 2–3 sentences (40–70 words) on why this story matters, grounded in at ' +
+      'least ONE specific actor / metric / date / place drawn from the context above. ' +
+      'Plain prose, no section labels in the output:',
+  );

  return {
-    system: WHY_MATTERS_SYSTEM,
+    system: WHY_MATTERS_ANALYST_SYSTEM_V2,
    user: sections.join('\n\n'),
  };
 }
--- a/shared/brief-llm-core.d.ts
+++ b/shared/brief-llm-core.d.ts
@@ -4,6 +4,9 @@ export interface BriefStoryHashInput {
  threatLevel?: string;
  category?: string;
  country?: string;
+  /** v5: part of cache identity so same-story + different description
+   *  don't collide on cached analyst output. */
+  description?: string;
 }

 export interface BriefStoryPromptInput {
@@ -24,3 +27,7 @@ export function buildWhyMattersUserPrompt(story: BriefStoryPromptInput): {
 export function parseWhyMatters(text: unknown): string | null;

 export function hashBriefStory(story: BriefStoryHashInput): Promise<string>;
+
+// ── v2 (analyst path only) ────────────────────────────────────────────────
+export const WHY_MATTERS_ANALYST_SYSTEM_V2: string;
+export function parseWhyMattersV2(text: unknown): string | null;
--- a/shared/brief-llm-core.js
+++ b/shared/brief-llm-core.js
@@ -69,11 +69,20 @@ export function parseWhyMatters(text) {
 }

 /**
- * Deterministic 16-char hex hash of the five story fields that flow
- * into the whyMatters prompt. Same material as the pre-v3 sync
- * implementation (`scripts/lib/brief-llm.mjs:hashBriefStory`) — a
- * fixed fixture in tests/brief-llm-core.test.mjs pins the output so a
- * future refactor cannot silently invalidate every cached entry.
+ * Deterministic 16-char hex hash of the SIX story fields that flow
+ * into the whyMatters prompt (5 core + description). Cache identity
+ * MUST cover every field that shapes the LLM output, or two requests
+ * with the same core fields but different descriptions will share a
+ * cache entry and the second caller gets prose grounded in the first
+ * caller's description (P1 regression caught in PR #3269 review).
+ *
+ * History:
+ *   - pre-v3: 5 fields, sync `node:crypto.createHash`.
+ *   - v3: moved to Web Crypto (async), same 5 fields.
+ *   - v5 (with endpoint cache bump to brief:llm:whymatters:v5:):
+ *     6 fields — `description` added to match the analyst path's
+ *     v2 prompt which interpolates `Description: <desc>` between
+ *     headline and source.
 *
 * Uses Web Crypto so the module is edge-safe. Returns a Promise because
 * `crypto.subtle.digest` is async; cron call sites are already in an
@@ -85,6 +94,7 @@ export function parseWhyMatters(text) {
 *   threatLevel?: string;
 *   category?: string;
 *   country?: string;
+ *   description?: string;
 * }} story
 * @returns {Promise<string>}
 */
@@ -95,6 +105,11 @@ export async function hashBriefStory(story) {
    story.threatLevel ?? '',
    story.category ?? '',
    story.country ?? '',
+    // New in v5: description is a prompt input on the analyst path,
+    // so MUST be part of cache identity. Absent on legacy paths →
+    // empty string → deterministic; same-story-same-description pairs
+    // still collide on purpose, different descriptions don't.
+    story.description ?? '',
  ].join('||');
  const bytes = new TextEncoder().encode(material);
  const digest = await crypto.subtle.digest('SHA-256', bytes);
@@ -105,3 +120,69 @@ export async function hashBriefStory(story) {
  }
  return hex.slice(0, 16);
 }
+
+// ── Analyst-path prompt v2 (multi-sentence, grounded) ──────────────────────
+//
+// Shadow-diff on 12 prod stories (2026-04-21) showed the v1 analyst output
+// was indistinguishable from the legacy Gemini-only output: identical
+// single-sentence abstraction-speak ("destabilize / systemic / sovereign
+// risk repricing") with no named actors, metrics, or dates. Root cause:
+// the 18–30 word cap compressed the context's specifics out of the LLM's
+// response. v2 loosens to 40–70 words across 2–3 sentences and REQUIRES
+// the LLM to ground at least one specific reference from the live context.
+
+/**
+ * System prompt for the analyst-path v2 (2–3 sentences, ~40–70 words,
+ * grounded in a specific named actor / metric / date / place drawn
+ * from the live context). Shape nudged toward the WMAnalyst chat voice
+ * (SITUATION → ANALYSIS → optional WATCH) but rendered as plain prose,
+ * no section labels in the output.
+ */
+export const WHY_MATTERS_ANALYST_SYSTEM_V2 =
+  'You are the lead analyst at WorldMonitor Brief, a geopolitical intelligence magazine. ' +
+  'Using the Live WorldMonitor Context AND the story, write 2–3 sentences (40–70 words total) ' +
+  'on why the story matters.\n\n' +
+  'STRUCTURE:\n' +
+  '1. SITUATION — what is happening right now, grounded in a SPECIFIC named actor, ' +
+  'metric, date, or place drawn from the context.\n' +
+  '2. ANALYSIS — the structural consequence (why this forces a repricing, shifts ' +
+  'the balance, triggers a cascade).\n' +
+  '3. (Optional) WATCH — the threshold or indicator to track, if clear from the context.\n\n' +
+  'HARD CONSTRAINTS:\n' +
+  '- Total length 40–70 words across 2–3 sentences.\n' +
+  '- MUST reference at least ONE specific: named person / country / organization / ' +
+  'number / percentage / date / city — drawn from the context, NOT invented.\n' +
+  '- No preamble ("This matters because…", "The importance of…").\n' +
+  '- No markdown, no bullet points, no section labels in the output — plain prose.\n' +
+  '- Editorial, impersonal, serious. No calls to action, no questions, no quotes.';
+
+/**
+ * Parse + validate the analyst-path v2 LLM response. Accepts
+ * multi-sentence output (2–3 sentences), 100–500 chars. Otherwise
+ * same rejection semantics as v1 (stub echo, empty) plus explicit
+ * rejection of preamble boilerplate and leaked section labels.
+ *
+ * Returns null when the output is obviously wrong so the caller can
+ * fall through to the next layer.
+ *
+ * @param {unknown} text
+ * @returns {string | null}
+ */
+export function parseWhyMattersV2(text) {
+  if (typeof text !== 'string') return null;
+  let s = text.trim();
+  if (!s) return null;
+  // Drop surrounding quotes if the model insisted.
+  s = s.replace(/^[\u201C"']+/, '').replace(/[\u201D"']+$/, '').trim();
+  if (s.length < 100 || s.length > 500) return null;
+  // Reject the stub echo (same as v1).
+  if (/^story flagged by your sensitivity/i.test(s)) return null;
+  // Reject common preamble the system prompt explicitly banned.
+  if (/^(this matters because|the importance of|it is important|importantly,|in summary,|to summarize)/i.test(s)) {
+    return null;
+  }
+  // Reject markdown / section-label leakage (we told it to use plain prose).
+  if (/^(#|-|\*|\d+\.\s)/.test(s)) return null;
+  if (/^(situation|analysis|watch)\s*[:\-–—]/i.test(s)) return null;
+  return s;
+}
--- a/tests/brief-llm-core.test.mjs
+++ b/tests/brief-llm-core.test.mjs
@@ -23,8 +23,9 @@ import {
  parseWhyMatters,
 } from '../shared/brief-llm-core.js';

-// Pre-extract sync impl, kept inline so the parity test can't drift from
-// what the cron used to emit.
+// Mirror impl (sync `node:crypto`) — kept inline so a drift between
+// the Web Crypto implementation and this sentinel fails the parity
+// test here first. Must include `description` to match v5 semantics.
 function legacyHashBriefStory(story) {
  const material = [
    story.headline ?? '',
@@ -32,6 +33,7 @@ function legacyHashBriefStory(story) {
    story.threatLevel ?? '',
    story.category ?? '',
    story.country ?? '',
+    story.description ?? '',
  ].join('||');
  return createHash('sha256').update(material).digest('hex').slice(0, 16);
 }
@@ -74,6 +76,30 @@ describe('hashBriefStory — Web Crypto parity with legacy node:crypto', () => {
    }
  });

+  it('description is part of cache identity (v5 regression guard)', async () => {
+    // Pinned from PR #3269 review P1: adding `description` to the
+    // analyst prompt without adding it to the hash caused same-story-
+    // diff-description to collide on one cache entry, so callers got
+    // prose grounded in a PREVIOUS caller's description.
+    const withDescA = {
+      ...FIXTURE,
+      description: 'Tehran publicly reopened commercial shipping.',
+    };
+    const withDescB = {
+      ...FIXTURE,
+      description: 'Iran formally blockaded outbound tankers.',
+    };
+    const noDesc = { ...FIXTURE };
+
+    const hashA = await hashBriefStory(withDescA);
+    const hashB = await hashBriefStory(withDescB);
+    const hashNone = await hashBriefStory(noDesc);
+
+    assert.notEqual(hashA, hashB, 'different descriptions must produce different hashes');
+    assert.notEqual(hashA, hashNone, 'description present vs absent must differ');
+    assert.notEqual(hashB, hashNone);
+  });
+
  it('treats missing fields as empty strings (backcompat)', async () => {
    const partial = { headline: FIXTURE.headline };
    const expected = legacyHashBriefStory(partial);
@@ -140,3 +166,78 @@ describe('parseWhyMatters — pure sentence validator', () => {
    assert.equal(parseWhyMatters(s), s);
  });
 });
+
+describe('parseWhyMattersV2 — multi-sentence, analyst-path only', () => {
+  it('lazy-loads', async () => {
+    const mod = await import('../shared/brief-llm-core.js');
+    assert.equal(typeof mod.parseWhyMattersV2, 'function');
+  });
+
+  it('accepts 2–3 sentences totalling 100–500 chars', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    const good =
+      "Iran's closure of the Strait of Hormuz on April 21 halts roughly 20% of global seaborne oil. " +
+      'The disruption forces an immediate repricing of sovereign risk across Gulf energy exporters. ' +
+      'Watch IMF commentary in the next 48 hours for cascading guidance.';
+    assert.ok(good.length >= 100 && good.length <= 500);
+    assert.equal(parseWhyMattersV2(good), good);
+  });
+
+  it('rejects <100 chars (too terse for the analyst contract)', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    assert.equal(parseWhyMattersV2('Short.'), null);
+    assert.equal(parseWhyMattersV2('x'.repeat(99)), null);
+  });
+
+  it('rejects >500 chars (runaway generation)', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    assert.equal(parseWhyMattersV2('a'.repeat(501)), null);
+  });
+
+  it('rejects preamble the system prompt banned', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    const cases = [
+      'This matters because global energy markets depend on the Strait of Hormuz remaining open for transit and this is therefore a critical development.',
+      'The importance of this development cannot be overstated given the potential for cascading economic impacts across multiple regions and industries.',
+      'It is important to note that the ongoing situation in the Strait of Hormuz has implications that extend far beyond simple maritime concerns.',
+      'Importantly, the developments in the Strait of Hormuz today signal a shift in regional dynamics that could reshape global energy markets for months.',
+      'In summary, the current situation presents significant risks to global stability and requires careful monitoring of diplomatic and military channels.',
+      'To summarize the situation, the Strait of Hormuz developments represent a critical juncture in regional power dynamics with broad implications.',
+    ];
+    for (const c of cases) {
+      assert.ok(c.length >= 100 && c.length <= 500);
+      assert.equal(parseWhyMattersV2(c), null, `should reject preamble: ${c.slice(0, 40)}…`);
+    }
+  });
+
+  it('rejects markdown / leaked section labels the prompt told it to omit', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    const cases = [
+      '# Situation\nIran closed the strait on April 21, halting 20% of seaborne oil. Analysis: sovereign risk repricing follows immediately for Gulf exporters.',
+      '- Bullet one that should not open the response at all given the plain-prose rule in the system message.\n- Bullet two of the banned response.',
+      '* Leading bullet with asterisk that should also trip the markdown rejection because analyst prose should be plain paragraphs across 2–3 sentences.',
+      '1. Numbered point opening the response is equally banned by the system prompt requiring plain prose across two to three sentences with grounded references.',
+      'SITUATION: Iran closed Hormuz today. ANALYSIS: cascading sovereign repricing follows. WATCH: IMF Gulf commentary in 48h. This mirrors the 2019 pattern.',
+      'Analysis — the Strait closure triggers a cascading sovereign risk repricing across Gulf exporters with immediate effect on global markets and shipping lanes.',
+    ];
+    for (const c of cases) {
+      assert.equal(parseWhyMattersV2(c), null, `should reject leaked label: ${c.slice(0, 40)}…`);
+    }
+  });
+
+  it('still rejects the stub echo', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    const stub =
+      'Story flagged by your sensitivity settings. Open for context. This stub is long enough to clear the 100-char floor but must still be rejected as non-enrichment output.';
+    assert.equal(parseWhyMattersV2(stub), null);
+  });
+
+  it('strips surrounding smart-quotes before validation', async () => {
+    const { parseWhyMattersV2 } = await import('../shared/brief-llm-core.js');
+    const raw =
+      '\u201CIran closed the Strait on April 21, halting 20% of seaborne oil. The disruption forces an immediate repricing of sovereign risk across Gulf exporters.\u201D';
+    const out = parseWhyMattersV2(raw);
+    assert.ok(out && !out.startsWith('\u201C'));
+    assert.ok(out && !out.endsWith('\u201D'));
+  });
+});
--- a/tests/brief-why-matters-analyst.test.mjs
+++ b/tests/brief-why-matters-analyst.test.mjs
@@ -204,10 +204,9 @@ describe('generateWhyMatters — analyst priority', () => {
    assert.equal(callLlmInvoked, true, 'legacy callLLM must fire after analyst miss');
  });

-  it('falls through when analyst returns unparseable prose (parser rejection)', async () => {
+  it('falls through when analyst returns out-of-bounds output (too short)', async () => {
    let callLlmInvoked = false;
    const out = await generateWhyMatters(story(), {
-      // Too short — fails parseWhyMatters length gate (< 30 chars).
      callAnalystWhyMatters: async () => 'Short.',
      callLLM: async () => {
        callLlmInvoked = true;
@@ -217,7 +216,33 @@ describe('generateWhyMatters — analyst priority', () => {
      cacheSet: async () => {},
    });
    assert.equal(out, VALID);
-    assert.equal(callLlmInvoked, true, 'unparseable analyst output must trigger fallback');
+    assert.equal(callLlmInvoked, true, 'out-of-bounds analyst output must trigger fallback');
+  });
+
+  it('preserves multi-sentence v2 analyst output verbatim (P1 regression guard)', async () => {
+    // Regression guard from the PR #3269 review. The endpoint has already
+    // validated its 2–3 sentence reply with parseWhyMattersV2, so the cron
+    // must trust it and reject only obvious garbage (length / stub echo).
+    // Reparsing with the v1 single-sentence parser would silently drop
+    // the 2nd and 3rd sentences.
+    const analystProse =
+      "Iran's closure of the Strait of Hormuz on April 21 halts roughly 20% of global seaborne oil. " +
+      'The disruption forces an immediate repricing of sovereign risk across Gulf energy exporters. ' +
+      'Watch IMF commentary in the next 48 hours for cascading guidance.';
+    let legacyFired = false;
+    const out = await generateWhyMatters(story(), {
+      callAnalystWhyMatters: async () => analystProse,
+      callLLM: async () => {
+        legacyFired = true;
+        return VALID;
+      },
+      cacheGet: async () => null,
+      cacheSet: async () => {},
+    });
+    assert.equal(out, analystProse, 'multi-sentence v2 output must reach the envelope unchanged');
+    assert.equal(legacyFired, false, 'legacy callLLM must not fire when v2 analyst succeeds');
+    // Belt and braces: a sentence boundary survives, i.e. the text was not cut to its first sentence.
+    assert.ok(out.split('. ').length >= 2, 'output must retain 2nd+ sentences');
   });

  it('falls through when analyst throws', async () => {
@@ -359,7 +384,8 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
    assert.ok(typeof builder === 'function');
  });

-  it('reuses WHY_MATTERS_SYSTEM verbatim', () => {
+  it('uses the analyst v2 system prompt (multi-sentence, grounded)', async () => {
+    const { WHY_MATTERS_ANALYST_SYSTEM_V2 } = await import('../shared/brief-llm-core.js');
    const { system } = builder(story(), {
      worldBrief: 'X',
      countryBrief: '',
@@ -369,10 +395,13 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
      macroSignals: '',
      degraded: false,
    });
-    assert.equal(system, WHY_MATTERS_SYSTEM);
+    assert.equal(system, WHY_MATTERS_ANALYST_SYSTEM_V2);
+    // Contract must still mention the 40–70 word target + grounding rule.
+    assert.match(system, /40–70 words/);
+    assert.match(system, /named person \/ country \/ organization \/ number \/ percentage \/ date \/ city/);
  });

-  it('includes the story fields in the same 5-line format', () => {
+  it('includes story fields with the multi-sentence footer', () => {
    const { user } = builder(story(), {
      worldBrief: '',
      countryBrief: '',
@@ -387,7 +416,38 @@ describe('buildAnalystWhyMattersPrompt — shape and budget', () => {
    assert.match(user, /Severity: critical/);
    assert.match(user, /Category: Geopolitical Risk/);
    assert.match(user, /Country: IR/);
-    assert.match(user, /One editorial sentence on why this matters:$/);
+    assert.match(user, /Write 2–3 sentences \(40–70 words\)/);
+    assert.match(user, /grounded in at least ONE specific/);
+  });
+
+  it('includes story description when present', () => {
+    // Base story fixture plus the optional description field.
+    const description = 'Tehran publicly reopened the Strait of Hormuz to commercial shipping today.';
+    const emptyContext = {
+      worldBrief: '',
+      countryBrief: '',
+      riskScores: '',
+      forecasts: '',
+      marketData: '',
+      macroSignals: '',
+      degraded: false,
+    };
+    const prompt = builder({ ...story(), description }, emptyContext);
+    // The description must show up in the user prompt behind a "Description:" label.
+    assert.match(prompt.user, /Description: Tehran publicly reopened/);
+  });
+
+  it('omits description line when field absent', () => {
+    const emptyContext = {
+      worldBrief: '',
+      countryBrief: '',
+      riskScores: '',
+      forecasts: '',
+      marketData: '',
+      macroSignals: '',
+      degraded: false,
+    };
+    assert.doesNotMatch(builder(story(), emptyContext).user, /Description:/); // story() is passed through untouched; the test relies on it carrying no description
   });

  it('omits context block when all fields empty', () => {