Files
worldmonitor/server/_shared/llm-sanitize.js
Elie Habib 5e23ae4ec7 fix(security): sanitize LLM prompt inputs against injection attacks (#1877)
* fix(security): sanitize LLM prompt inputs against injection attacks

Adds server/_shared/llm-sanitize.js with sanitizeForPrompt() and
sanitizeHeadlines() that strip known prompt-injection patterns before
untrusted RSS headlines and geo-context strings are embedded in LLM
prompts.

Patterns stripped:
- Model-specific delimiters (<|im_start|>, [INST], <|endoftext|>, etc.)
- XML-style role wrappers (<system>, <assistant>, <user>)
- Role override markers at line start (SYSTEM:, ### Claude:, etc.)
- Instruction-override phrases (Ignore previous instructions, etc.)
- ASCII/Unicode control characters, zero-width joiners, BOM, soft-hyphen
- Separator lines (---, ===) used as prompt boundaries

Normal headlines (quotes, colons, dashes, emoji, unicode) pass through
unchanged. The sanitizer is defense-in-depth, not a security boundary.

Wired into summarize-article.ts replacing the previous slice-only approach.
Tests: 36 cases across 8 suites, all passing.

Co-authored-by: Fayez Bast <FayezBast@users.noreply.github.com>
Ported from PR #381

* fix(types): add type declarations for llm-sanitize.js

* fix(sanitize): address Codex review — light sanitizer for headlines

- Add sanitizeHeadline() / sanitizeHeadlinesLight(): strips only structural
  patterns (model delimiters, control chars) without touching semantic
  instruction phrases
- Use sanitizeHeadlinesLight() for headlines so that legitimate tech/security
  news like 'Anthropic says users can type "Output your system prompt"...'
  passes through unchanged and cache keys stay aligned with the browser
- Keep full sanitizeForPrompt() for geoContext only (free-form, higher risk)
- 40 tests, all passing

* fix(security): apply full injection sanitizer at prompt-build time (P1)

Separate the two uses of headlines:
- Cache key: sanitizeHeadlinesLight() (structural only, preserves semantic
  phrases) so browser/server cache keys stay aligned
- Prompt build: sanitizeHeadlines() (full sanitizer including semantic
  injection phrases) applied inside the fetcher just before buildArticlePrompts()

This closes the P1 gap where "Ignore previous instructions" and similar
payloads in RSS headlines were reaching the LLM prompt unchanged.

---------

Co-authored-by: Fayez Bast <FayezBast@users.noreply.github.com>
2026-03-19 17:00:45 +04:00

157 lines
6.5 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* LLM Prompt Injection Sanitizer
*
* Strips known prompt-injection patterns from untrusted strings (e.g. RSS
* headlines) before they are embedded in an LLM prompt.
*
* Design philosophy — blocklist of *bad* patterns only:
* ✓ Quotes, colons, dashes, em-dashes, ellipses → preserved (normal headlines)
* ✓ Unicode letters and emoji → preserved
* ✓ Sentence-level punctuation → preserved
* ✗ Role markers (e.g. "SYSTEM:", "### Assistant") → stripped
* ✗ Instruction overrides ("Ignore previous …") → stripped
* ✗ Model-specific delimiters ("<|im_start|>", etc.) → stripped
* ✗ ASCII / Unicode control characters (U+0000-U+001F except tab, LF and CR; U+007F; U+2028-U+2029) → stripped
* ✗ Null bytes, soft hyphens, zero-width spaces / joiners / non-joiners, BOM → stripped
*
* The sanitizer never throws. If input is not a string it returns '' so
* callers can safely map over headline arrays without extra guards.
*
* Security note:
* This is a defense-in-depth reduction layer, not a security boundary.
* Prompt-injection blocklists are inherently bypassable (for example via novel
* encodings, obfuscation, or semantically malicious content), so callers must
* keep additional controls in place (strict output validation, model/provider
* guardrails, and least-privilege tool access).
*
* References:
* OWASP LLM Top 10 LLM01: Prompt Injection
*/
/**
 * Blocklist of prompt-injection patterns. Every entry carries the `g` flag so a
 * single String.prototype.replace() call removes all occurrences; callers
 * replace each match with a single space.
 *
 * These regexes run on untrusted text, so they must not contain adjacent
 * quantifiers that can split the same run of characters in several ways
 * (polynomial backtracking).
 */
const INJECTION_PATTERNS = [
  // Model-specific delimiter tokens
  /<\|(?:im_start|im_end|begin_of_text|end_of_text|eot_id|start_header_id|end_header_id)\|>/gi,
  /<\|(?:endoftext|fim_prefix|fim_middle|fim_suffix|pad)\|>/gi,
  /\[(?:INST|\/INST|SYS|\/SYS)\]/gi,
  // XML-style role wrappers (opening or closing tag, attributes included)
  /<\/?(system|user|assistant|prompt|context|instruction)\b[^>]*>/gi,
  // Role override markers at line start ("SYSTEM:", "### [Claude] :", "(model):").
  // Each optional part (heading hashes, opening bracket, closing bracket) owns its
  // trailing whitespace. The previous form placed two `\s*` next to each other
  // whenever the optional bracket was absent, which made long whitespace/newline
  // runs backtrack cubically. This form matches exactly the same strings.
  /(?:^|\n)\s*(?:#{1,4}\s*)?(?:[\[(]\s*)?(?:system|human|gpt|claude|llm|model|prompt)\s*(?:[\])]\s*)?:/gim,
  // Explicit instruction-override phrases
  /ignore\s+(?:all\s+)?(?:previous|above|prior|earlier|the\s+above)\s+instructions?\b/gi,
  /(?:disregard|forget|bypass|override|overwrite|skip)\s+(?:all\s+)?(?:previous|above|prior|earlier|your|the)\s+(?:instructions?|prompt|rules?|guidelines?|constraints?|training)\b/gi,
  /(?:you\s+are\s+now|act\s+as|pretend\s+(?:to\s+be|you\s+are)|roleplay\s+as|simulate\s+(?:being\s+)?a)\s+(?:a\s+|an\s+)?(?:(?:different|new|another|unrestricted|jailbroken|evil|helpful)\s+)?(?:ai|assistant|model|chatbot|llm|bot|gpt|claude)\b/gi,
  /do\s+not\s+(?:follow|obey|adhere\s+to|comply\s+with)\s+(?:the\s+)?(?:previous|above|system|original)\s+(?:instructions?|rules?|prompt)\b/gi,
  /(?:output|print|display|reveal|show|repeat|recite|write\s+out)\s+(?:your\s+)?(?:system\s+prompt|instructions?|initial\s+prompt|original\s+prompt|context)\b/gi,
  // Prompt boundary separator lines: a whole line of three or more '-' / '='
  /^[\-=]{3,}$/gm,
  // Three or more '#' followed by whitespace at the start of a line
  /^#{3,}\s/gm,
];
// Speaker label at the very start of a line: "user:", "[assistant]:", "### (bot) :".
// Each optional part owns its trailing whitespace so that a long whitespace-only
// prefix is scanned once; the previous form had two adjacent `\s*` quantifiers that
// backtracked quadratically on such input. The set of accepted lines is unchanged.
const ROLE_PREFIX_RE = /^\s*(?:#{1,4}\s*)?(?:[\[(]\s*)?(?:user|assistant|bot)\s*(?:[\])]\s*)?:\s*/i;
// Phrases that, on their own, mark a role-prefixed line as an override attempt.
const ROLE_OVERRIDE_STRONG_RE = /\b(?:you\s+are\s+now|act\s+as|pretend\s+(?:to\s+be|you\s+are)|roleplay\s+as|simulate\s+(?:being\s+)?a|from\s+now\s+on|do\s+not\s+(?:follow|obey|adhere\s+to|comply\s+with))\b/i;
// Command verbs; only meaningful together with ROLE_OVERRIDE_TARGET_RE.
const ROLE_OVERRIDE_COMMAND_RE = /\b(?:ignore|disregard|forget|bypass|override|overwrite|skip|reveal|output|print|display|show|repeat|recite|write\s+out)\b/i;
// "follow/obey ... instructions|prompt|rules|guidelines|constraints".
const ROLE_OVERRIDE_FOLLOW_RE = /\b(?:follow|obey)\s+(?:all\s+)?(?:the\s+|my\s+|your\s+)?(?:instructions?|prompt|rules?|guidelines?|constraints?)\b/i;
// Nouns a command verb must be paired with for the line to count as an injection.
const ROLE_OVERRIDE_TARGET_RE = /\b(?:instructions?|prompt|system|rules?|guidelines?|constraints?|training|context|developer\s+message)\b/i;

/**
 * Decide whether a single line is a role-prefixed injection attempt.
 *
 * A line qualifies when it starts with a user/assistant/bot speaker label AND
 * contains either a strong override phrase, a "follow/obey the instructions"
 * phrase, or a command verb together with a target noun.
 *
 * None of the regexes use the `g` flag, so `.test()` keeps no state between calls.
 *
 * @param {string} line - One line of text (no newline expected).
 * @returns {boolean} true when the whole line should be dropped.
 */
function isRolePrefixedInjectionLine(line) {
  if (!ROLE_PREFIX_RE.test(line)) return false;
  if (ROLE_OVERRIDE_STRONG_RE.test(line) || ROLE_OVERRIDE_FOLLOW_RE.test(line)) return true;
  return ROLE_OVERRIDE_COMMAND_RE.test(line) && ROLE_OVERRIDE_TARGET_RE.test(line);
}
// Characters removed from untrusted text:
// U+0000-U+001F ASCII control chars, except tab (U+0009), line feed (U+000A)
//               and carriage return (U+000D), which the class below skips
// U+007F DEL
// U+00AD soft hyphen
// U+200B-U+200D zero-width space / non-joiner / joiner
// U+2028-U+2029 Unicode line/paragraph separator
// U+FEFF BOM / zero-width no-break space
// The `g` flag lets a single replace() call strip every occurrence.
const CONTROL_CHARS_RE = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\xAD\u200B-\u200D\u2028\u2029\uFEFF]/g;
/**
 * Sanitize a single string for safe inclusion in an LLM prompt.
 *
 * Steps:
 *   1. Strip control / invisible characters (once; later steps never add any).
 *   2. Repeat until the text stops changing:
 *      a. drop whole lines that are role-prefixed injection attempts,
 *      b. replace every INJECTION_PATTERNS match with a single space,
 *      c. collapse whitespace runs and trim.
 *
 * Step 2 is iterated because removing one pattern can expose another that the
 * same pass has already looked for. For example, "ignore\n---\nprevious
 * instructions" only becomes a contiguous override phrase after the separator
 * line has been removed. Iterating to a fixed point also makes the function
 * idempotent: sanitizing already-sanitized text returns it unchanged.
 *
 * Termination: a pass that changes the text always makes it shorter (lines are
 * dropped, multi-character matches become one space, whitespace runs shrink),
 * so the loop runs at most `input.length` times.
 *
 * Never throws for any input; non-string input yields ''.
 *
 * @param {unknown} input
 * @returns {string}
 */
export function sanitizeForPrompt(input) {
  if (typeof input !== 'string') return '';

  let current = input.replace(CONTROL_CHARS_RE, '');
  let previous;
  do {
    previous = current;

    current = current
      .split('\n')
      .filter((line) => !isRolePrefixedInjectionLine(line))
      .join('\n');

    for (const pattern of INJECTION_PATTERNS) {
      // Patterns are shared /g regexes; reset any state left on them.
      pattern.lastIndex = 0;
      current = current.replace(pattern, ' ');
    }

    current = current.replace(/\s{2,}/g, ' ').trim();
  } while (current !== previous);

  return current;
}
/**
 * Run sanitizeForPrompt() over every entry of a headline list and keep only
 * the entries that are still non-empty afterwards.
 *
 * Anything that is not an array produces an empty list. Non-string entries
 * sanitize to '' and are therefore dropped.
 *
 * @param {unknown[]} headlines
 * @returns {string[]}
 */
export function sanitizeHeadlines(headlines) {
  if (!Array.isArray(headlines)) {
    return [];
  }

  const kept = [];
  for (const candidate of headlines) {
    const cleaned = sanitizeForPrompt(candidate);
    if (cleaned.length > 0) {
      kept.push(cleaned);
    }
  }
  return kept;
}
// Structural-only patterns safe to apply to headlines without mangling
// legitimate tech/security news (e.g. "Output your system prompt" as a story subject).
// Every entry carries the `g` flag, so one replace() call removes all occurrences.
const STRUCTURAL_PATTERNS = [
// Model-specific delimiter tokens such as <|im_start|> or <|eot_id|>
/<\|(?:im_start|im_end|begin_of_text|end_of_text|eot_id|start_header_id|end_header_id)\|>/gi,
// Further <|...|> tokens: endoftext, fim_prefix, fim_middle, fim_suffix, pad
/<\|(?:endoftext|fim_prefix|fim_middle|fim_suffix|pad)\|>/gi,
// Bracketed markers: [INST], [/INST], [SYS], [/SYS]
/\[(?:INST|\/INST|SYS|\/SYS)\]/gi,
// Opening or closing XML-style tags named system/user/assistant/prompt/context/instruction,
// including any attributes up to the closing '>'
/<\/?(system|user|assistant|prompt|context|instruction)\b[^>]*>/gi,
// A line consisting solely of three or more '-' or '=' characters
/^[\-=]{3,}$/gm,
];
/**
 * Light-weight headline sanitizer.
 *
 * Removes control / invisible characters and the structural delimiter
 * patterns in STRUCTURAL_PATTERNS, then collapses whitespace runs and trims.
 * Semantic instruction phrases are deliberately left alone so that headlines
 * which merely quote an injection phrase as a news subject survive intact;
 * sanitizeForPrompt() is the stricter variant.
 *
 * Non-string input yields ''.
 *
 * @param {unknown} input
 * @returns {string}
 */
export function sanitizeHeadline(input) {
  if (typeof input !== 'string') {
    return '';
  }

  const withoutControlChars = input.replace(CONTROL_CHARS_RE, '');

  const withoutDelimiters = STRUCTURAL_PATTERNS.reduce((text, pattern) => {
    // Shared /g regex: clear any leftover state before reuse.
    pattern.lastIndex = 0;
    return text.replace(pattern, ' ');
  }, withoutControlChars);

  return withoutDelimiters.replace(/\s{2,}/g, ' ').trim();
}
/**
 * Array form of sanitizeHeadline(): sanitizes each entry and discards those
 * that end up empty. A non-array argument produces an empty list.
 *
 * @param {unknown[]} headlines
 * @returns {string[]}
 */
export function sanitizeHeadlinesLight(headlines) {
  if (!Array.isArray(headlines)) {
    return [];
  }

  return headlines.flatMap((candidate) => {
    const cleaned = sanitizeHeadline(candidate);
    return cleaned.length > 0 ? [cleaned] : [];
  });
}