fix(sanctions): replace fast-xml-parser with SAX streaming to fix Railway OOM (#2008)

The seed was being SIGKILL'd on Railway (512MB limit) because fast-xml-parser
built a ~300MB object tree from the 120MB OFAC SDN XML download, causing both
the raw XML string and the full parsed object to coexist in heap simultaneously.

Switch to sax (already a transitive dep) with a streaming pipeline: response.body
is piped chunk-by-chunk via a TextDecoder into the SAX parser. The full XML
string is never held in memory. Reference maps (areaCodes, featureTypes,
legalBasis, locations, parties) are populated as SAX events arrive, and entries
are emitted one at a time on </SanctionsEntry>.

All DOM-traversal helpers (listify, textValue, buildEpoch, buildReferenceMaps,
buildLocationMap, extractPartyName, resolveEntityType, extractPartyCountries,
buildPartyMap, extractPrograms, extractEffectiveAt, extractNote,
buildEntriesForDocument) are removed. Output-stage pure functions (uniqueSorted,
compactNote, sortEntries, buildCountryPressure, buildProgramPressure) are kept.

Tests updated to match: removed test blocks for deleted DOM helpers, kept
coverage for the remaining pure functions (2173/2173 pass).
This commit is contained in:
Elie Habib
2026-03-21 20:26:02 +04:00
committed by GitHub
parent c68b06489b
commit 56f237c37f
3 changed files with 378 additions and 785 deletions

View File

@@ -1,6 +1,10 @@
#!/usr/bin/env node
import { XMLParser } from 'fast-xml-parser';
// SAX streaming parser: response.body is piped chunk-by-chunk into the parser.
// The full XML string is never held in memory, which avoids the OOM crash that
// occurred when fast-xml-parser tried to build a ~300MB object tree from a
// 120MB XML download against Railway's 512MB container limit.
import sax from 'sax';
import { CHROME_UA, loadEnvFile, runSeed, verifySeedKey } from './_seed-utils.mjs';
@@ -18,40 +22,14 @@ const OFAC_SOURCES = [
{ label: 'CONSOLIDATED', url: 'https://sanctionslistservice.ofac.treas.gov/api/PublicationPreview/exports/cons_advanced.xml' },
];
const XML_PARSER = new XMLParser({
ignoreAttributes: false,
attributeNamePrefix: '',
removeNSPrefix: true,
parseTagValue: false,
trimValues: true,
});
/**
 * Coerce a possibly-missing, possibly-scalar value into an array.
 *
 * @param {*} value - Anything; arrays are passed through untouched.
 * @returns {Array} `[]` for null/undefined, the same array instance for
 *   arrays, otherwise a one-element array wrapping `value`.
 */
function listify(value) {
  if (value === null || value === undefined) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}
/**
 * Extract a trimmed text representation from a parsed XML value.
 *
 * Strings are trimmed, numbers and booleans are stringified, and objects
 * yield their `#text` property or, failing that, their `NamePartValue`
 * property when that property is a string. Everything else becomes ''.
 *
 * @param {*} value - Parsed node, scalar, or null/undefined.
 * @returns {string} The extracted text, or '' when none is available.
 */
function textValue(value) {
  if (value == null) return '';
  switch (typeof value) {
    case 'string':
      return value.trim();
    case 'number':
    case 'boolean':
      return String(value);
    case 'object': {
      // Same precedence as before: '#text' wins over 'NamePartValue'.
      for (const key of ['#text', 'NamePartValue']) {
        if (typeof value[key] === 'string') return value[key].trim();
      }
      return '';
    }
    default:
      return '';
  }
}
function buildEpoch(parts) {
const year = Number(parts?.Year || 0);
if (!year) return 0;
const month = Math.max(1, Number(parts?.Month || 1));
const day = Math.max(1, Number(parts?.Day || 1));
return Date.UTC(year, month - 1, day);
/**
 * Drop the XML namespace prefix from a qualified element name,
 * e.g. "sanc:SanctionsEntry" becomes "SanctionsEntry".
 *
 * Only the first colon is treated as the separator; names without a colon
 * are returned unchanged.
 *
 * @param {string} name - Qualified or unqualified element name.
 * @returns {string} The part of `name` after the first colon.
 */
function local(name) {
  const separatorAt = name.indexOf(':');
  if (separatorAt < 0) {
    return name;
  }
  return name.substring(separatorAt + 1);
}
/**
 * De-duplicate and sort a list of values as trimmed strings.
 *
 * Falsy inputs (null, undefined, '', 0, false) are dropped before
 * stringification, and values that trim to '' are dropped afterwards.
 * Ordering uses `String.prototype.localeCompare` with the runtime's
 * default locale.
 *
 * @param {Array<*>} values - Raw values to normalize.
 * @returns {string[]} Unique, trimmed, non-empty strings in sorted order.
 */
function uniqueSorted(values) {
  const cleaned = values
    .filter(Boolean)
    .map((v) => String(v).trim())
    .filter(Boolean);
  // Single return: the previous body carried a second, unreachable return
  // that duplicated this statement.
  return [...new Set(cleaned)].sort((a, b) => a.localeCompare(b));
}
function compactNote(value) {
@@ -60,194 +38,6 @@ function compactNote(value) {
return note.length > 240 ? `${note.slice(0, 237)}...` : note;
}
/**
 * Build a display name from a parsed DocumentedName node.
 *
 * The non-empty `NamePartValue` texts of every `DocumentedNamePart` are
 * joined with single spaces. When no part yields text, the text value of
 * the node itself is returned instead.
 *
 * @param {object|null|undefined} documentedName - Parsed DocumentedName node.
 * @returns {string} The assembled name, or '' when nothing is available.
 */
function extractDocumentedName(documentedName) {
  const pieces = [];
  for (const part of listify(documentedName?.DocumentedNamePart)) {
    const piece = textValue(part?.NamePartValue);
    if (piece) pieces.push(piece);
  }
  return pieces.length === 0 ? textValue(documentedName) : pieces.join(' ');
}
/**
 * Convert a parsed DateOfIssue node into an epoch value.
 *
 * @param {object|null|undefined} value - Node handed to `buildEpoch`.
 * @returns {number} The value returned by `buildEpoch`, or 0 when that
 *   value is not a finite number.
 */
function normalizeDateOfIssue(value) {
  const issuedAt = buildEpoch(value);
  if (!Number.isFinite(issuedAt)) {
    return 0;
  }
  return issuedAt;
}
/**
 * Index the reference value sets of a parsed document by their ID.
 *
 * @param {object|null|undefined} doc - Parsed document root; its
 *   `ReferenceValueSets` member is read if present.
 * @returns {{areaCodes: Map<string, {code: string, name: string}>,
 *            featureTypes: Map<string, string>,
 *            legalBasis: Map<string, string>}}
 *   `areaCodes` maps ID to the node text (code) and trimmed Description;
 *   `featureTypes` maps ID to the node text; `legalBasis` maps ID to the
 *   trimmed LegalBasisShortRef, falling back to the node text.
 */
function buildReferenceMaps(doc) {
  const valueSets = doc?.ReferenceValueSets ?? {};
  const keyOf = (item) => String(item.ID || '');

  const areaCodes = new Map(
    listify(valueSets?.AreaCodeValues?.AreaCode).map((item) => [
      keyOf(item),
      { code: textValue(item), name: String(item.Description || '').trim() },
    ]),
  );

  const featureTypes = new Map(
    listify(valueSets?.FeatureTypeValues?.FeatureType).map((item) => [keyOf(item), textValue(item)]),
  );

  const legalBasis = new Map(
    listify(valueSets?.LegalBasisValues?.LegalBasis).map((item) => [
      keyOf(item),
      String(item.LegalBasisShortRef || textValue(item) || '').trim(),
    ]),
  );

  return { areaCodes, featureTypes, legalBasis };
}
/**
 * Resolve every Location in the document to its area codes and names.
 *
 * @param {object|null|undefined} doc - Parsed document root.
 * @param {Map<string, {code: string, name: string}>} areaCodes - Area code
 *   reference map keyed by AreaCodeID.
 * @returns {Map<string, {codes: string[], names: string[]}>} Location ID to
 *   parallel `codes`/`names` arrays, de-duplicated by code, with empty
 *   codes removed, ordered by code.
 */
function buildLocationMap(doc, areaCodes) {
  const locations = new Map();
  for (const location of listify(doc?.Locations?.Location)) {
    // Key by code so a repeated code keeps one name and the pair stays aligned.
    const nameByCode = new Map();
    for (const ref of listify(location?.LocationAreaCode)) {
      const area = areaCodes.get(String(ref.AreaCodeID || ''));
      if (area) nameByCode.set(area.code, area.name);
    }

    const ordered = [...nameByCode]
      .filter(([code]) => code.length > 0)
      .sort(([left], [right]) => left.localeCompare(right));

    const codes = [];
    const names = [];
    for (const [code, name] of ordered) {
      codes.push(code);
      names.push(name);
    }
    locations.set(String(location.ID || ''), { codes, names });
  }
  return locations;
}
/**
 * Pick the display name for a party profile.
 *
 * Aliases from all identities are considered in document order. Preference
 * goes to the first alias flagged `Primary === 'true'`, then to the first
 * alias with `AliasTypeID === '1403'`, then to the first alias of any kind.
 *
 * @param {object|null|undefined} profile - Parsed Profile node.
 * @returns {string} Result of `extractDocumentedName` for the chosen alias.
 */
function extractPartyName(profile) {
  const candidates = [];
  for (const identity of listify(profile?.Identity)) {
    candidates.push(...listify(identity?.Alias));
  }

  let chosen = candidates.find((alias) => alias?.Primary === 'true');
  if (!chosen) chosen = candidates.find((alias) => alias?.AliasTypeID === '1403');
  if (!chosen) chosen = candidates[0];

  return extractDocumentedName(chosen?.DocumentedName);
}
/**
 * Classify a party profile as vessel, aircraft, individual or entity.
 *
 * `PartySubTypeID` '1' means vessel and '2' means aircraft. Otherwise the
 * profile is an individual when any of its features resolves to a feature
 * type label mentioning birth, citizenship or nationality; everything else
 * is a generic entity.
 *
 * @param {object|null|undefined} profile - Parsed Profile node.
 * @param {Map<string, string>} featureTypes - Feature type ID to label.
 * @returns {string} One of the SANCTIONS_ENTITY_TYPE_* constants.
 */
function resolveEntityType(profile, featureTypes) {
  switch (String(profile?.PartySubTypeID || '')) {
    case '1':
      return 'SANCTIONS_ENTITY_TYPE_VESSEL';
    case '2':
      return 'SANCTIONS_ENTITY_TYPE_AIRCRAFT';
    default:
      break;
  }

  for (const feature of listify(profile?.Feature)) {
    const label = featureTypes.get(String(feature?.FeatureTypeID || '')) || '';
    if (label && /birth|citizenship|nationality/i.test(label)) {
      return 'SANCTIONS_ENTITY_TYPE_INDIVIDUAL';
    }
  }
  return 'SANCTIONS_ENTITY_TYPE_ENTITY';
}
/**
 * Collect the countries attached to a profile through its location features.
 *
 * Only features whose type label matches /location/i are considered. Each
 * referenced location contributes its code/name pairs; the first name seen
 * for a given code wins, and the result is ordered by code.
 *
 * @param {object|null|undefined} profile - Parsed Profile node.
 * @param {Map<string, string>} featureTypes - Feature type ID to label.
 * @param {Map<string, {codes: string[], names: string[]}>} locations -
 *   Location ID to resolved codes and names.
 * @returns {{countryCodes: string[], countryNames: string[]}} Parallel arrays.
 */
function extractPartyCountries(profile, featureTypes, locations) {
  const nameByCode = new Map();

  const locationFeatures = listify(profile?.Feature).filter((feature) =>
    /location/i.test(featureTypes.get(String(feature?.FeatureTypeID || '')) || ''));

  for (const feature of locationFeatures) {
    for (const version of listify(feature?.FeatureVersion)) {
      for (const ref of listify(version?.VersionLocation)) {
        const location = locations.get(String(ref?.LocationID || ''));
        if (!location) continue;
        for (const [index, code] of location.codes.entries()) {
          // First occurrence of a code fixes its name.
          if (code && !nameByCode.has(code)) {
            nameByCode.set(code, location.names[index] ?? '');
          }
        }
      }
    }
  }

  const ordered = [...nameByCode].sort(([left], [right]) => left.localeCompare(right));
  const countryCodes = [];
  const countryNames = [];
  for (const [code, name] of ordered) {
    countryCodes.push(code);
    countryNames.push(name);
  }
  return { countryCodes, countryNames };
}
/**
 * Index every DistinctParty in the document by profile ID.
 *
 * The key is the profile's `ID`, falling back to the party's `FixedRef`;
 * parties with neither are skipped.
 *
 * @param {object|null|undefined} doc - Parsed document root.
 * @param {Map<string, string>} featureTypes - Feature type ID to label.
 * @param {Map<string, {codes: string[], names: string[]}>} locations -
 *   Location ID to resolved codes and names.
 * @returns {Map<string, {name: string, entityType: string,
 *   countryCodes: string[], countryNames: string[]}>} Party summaries.
 */
function buildPartyMap(doc, featureTypes, locations) {
  const parties = new Map();
  listify(doc?.DistinctParties?.DistinctParty).forEach((distinctParty) => {
    const profile = distinctParty?.Profile;
    const key = String(profile?.ID || distinctParty?.FixedRef || '');
    if (key === '') return;

    const name = extractPartyName(profile);
    const entityType = resolveEntityType(profile, featureTypes);
    const countries = extractPartyCountries(profile, featureTypes, locations);
    parties.set(key, { name, entityType, ...countries });
  });
  return parties;
}
/**
 * List the sanctions program codes attached to an entry.
 *
 * A program code is any SanctionsMeasure comment that matches
 * `PROGRAM_CODE_RE`.
 *
 * @param {object|null|undefined} entry - Parsed SanctionsEntry node.
 * @returns {string[]} Program codes as returned by `uniqueSorted`.
 */
function extractPrograms(entry) {
  const codes = [];
  for (const measure of listify(entry?.SanctionsMeasure)) {
    const comment = textValue(measure?.Comment);
    if (PROGRAM_CODE_RE.test(comment)) codes.push(comment);
  }
  return uniqueSorted(codes);
}
/**
 * Find the most recent date attached to an entry.
 *
 * Candidates are the `Date` of every EntryEvent and, for every
 * SanctionsMeasure, `DatePeriod.Start.From` (or `DatePeriod.Start` itself
 * when `From` is absent). Dates that resolve to a non-positive epoch are
 * ignored.
 *
 * @param {object|null|undefined} entry - Parsed SanctionsEntry node.
 * @returns {number} The largest positive epoch found, or 0 when none.
 */
function extractEffectiveAt(entry) {
  const eventEpochs = listify(entry?.EntryEvent).map((event) => buildEpoch(event?.Date));
  const measureEpochs = listify(entry?.SanctionsMeasure).map((measure) => {
    const start = measure?.DatePeriod?.Start;
    return buildEpoch(start?.From || start);
  });

  const positive = [...eventEpochs, ...measureEpochs].filter((value) => value > 0);
  return positive.length === 0 ? 0 : Math.max(...positive);
}
/**
 * Produce a short note for an entry.
 *
 * The first SanctionsMeasure comment that is non-empty and does not match
 * `PROGRAM_CODE_RE` is used. When there is none, the first EntryEvent whose
 * `LegalBasisID` resolves to a non-empty legal-basis label supplies the note.
 *
 * @param {object|null|undefined} entry - Parsed SanctionsEntry node.
 * @param {Map<string, string>} legalBasis - Legal basis ID to label.
 * @returns {string} Result of `compactNote` on the chosen text (or on '').
 */
function extractNote(entry, legalBasis) {
  const freeText = [];
  for (const measure of listify(entry?.SanctionsMeasure)) {
    const comment = textValue(measure?.Comment);
    if (comment && !PROGRAM_CODE_RE.test(comment)) freeText.push(comment);
  }
  if (freeText.length > 0) {
    return compactNote(freeText[0]);
  }

  let legalRef = '';
  for (const event of listify(entry?.EntryEvent)) {
    const label = legalBasis.get(String(event?.LegalBasisID || '')) || '';
    if (label) {
      legalRef = label;
      break;
    }
  }
  return compactNote(legalRef);
}
/**
 * Turn a parsed sanctions document into flat entry records.
 *
 * Reference maps, locations and parties are built first; each
 * SanctionsEntry is then joined to its party via `ProfileID`.
 *
 * @param {object|null|undefined} doc - Parsed document root.
 * @param {string} sourceLabel - Label of the source list; used as the id
 *   prefix, as the sole `sourceLists` member, and as the fallback program.
 * @returns {{entries: object[], datasetDate: number}} The entry records and
 *   the normalized DateOfIssue.
 */
function buildEntriesForDocument(doc, sourceLabel) {
  const refs = buildReferenceMaps(doc);
  const locations = buildLocationMap(doc, refs.areaCodes);
  const parties = buildPartyMap(doc, refs.featureTypes, locations);
  const datasetDate = normalizeDateOfIssue(doc?.DateOfIssue);

  const entries = listify(doc?.SanctionsEntries?.SanctionsEntry).map((entry) => {
    const profileId = String(entry?.ProfileID || '');
    const party = parties.get(profileId);
    const name = party?.name || 'Unnamed designation';
    const programs = extractPrograms(entry);
    const idSuffix = String(entry?.ID || profileId || name);
    return {
      id: `${sourceLabel}:${idSuffix}`,
      name,
      entityType: party?.entityType || 'SANCTIONS_ENTITY_TYPE_ENTITY',
      countryCodes: party?.countryCodes ?? [],
      countryNames: party?.countryNames ?? [],
      programs: programs.length > 0 ? programs : [sourceLabel],
      sourceLists: [sourceLabel],
      effectiveAt: String(extractEffectiveAt(entry)),
      isNew: false,
      note: extractNote(entry, refs.legalBasis),
    };
  });

  return { entries, datasetDate };
}
function sortEntries(a, b) {
return (Number(b.isNew) - Number(a.isNew))
|| (Number(b.effectiveAt) - Number(a.effectiveAt))
@@ -256,11 +46,9 @@ function sortEntries(a, b) {
function buildCountryPressure(entries) {
const map = new Map();
for (const entry of entries) {
const codes = entry.countryCodes.length > 0 ? entry.countryCodes : ['XX'];
const names = entry.countryNames.length > 0 ? entry.countryNames : ['Unknown'];
codes.forEach((code, index) => {
const key = `${code}:${names[index] || names[0] || 'Unknown'}`;
const current = map.get(key) || {
@@ -278,7 +66,6 @@ function buildCountryPressure(entries) {
map.set(key, current);
});
}
return [...map.values()]
.sort((a, b) => b.newEntryCount - a.newEntryCount || b.entryCount - a.entryCount || a.countryName.localeCompare(b.countryName))
.slice(0, 12);
@@ -286,7 +73,6 @@ function buildCountryPressure(entries) {
function buildProgramPressure(entries) {
const map = new Map();
for (const entry of entries) {
const programs = entry.programs.length > 0 ? entry.programs : ['UNSPECIFIED'];
for (const program of programs) {
@@ -296,12 +82,18 @@ function buildProgramPressure(entries) {
map.set(program, current);
}
}
return [...map.values()]
.sort((a, b) => b.newEntryCount - a.newEntryCount || b.entryCount - a.entryCount || a.program.localeCompare(b.program))
.slice(0, 12);
}
/**
* Stream-parse one OFAC Advanced XML source via SAX.
*
* Memory model: response.body chunks → sax.parser (stateful, O(1) RAM per chunk)
* → accumulate only the minimal data structures needed for output.
* Peak heap is proportional to the number of entries/parties, not the XML size.
*/
async function fetchSource(source) {
console.log(` Fetching OFAC ${source.label}...`);
const t0 = Date.now();
@@ -309,26 +101,363 @@ async function fetchSource(source) {
headers: { 'User-Agent': CHROME_UA },
signal: AbortSignal.timeout(OFAC_TIMEOUT_MS),
});
if (!response.ok) {
throw new Error(`OFAC ${source.label} HTTP ${response.status}`);
}
if (!response.ok) throw new Error(`OFAC ${source.label} HTTP ${response.status}`);
// Block-scope xml so the ~120MB string is eligible for GC as soon as
// XML_PARSER.parse() returns, before buildEntriesForDocument allocates
// the entry objects. Without this, both coexist and OOM the 512MB limit.
let parsed;
{
const xml = await response.text();
console.log(` ${source.label}: ${(xml.length / 1024).toFixed(0)}KB downloaded (${Date.now() - t0}ms)`);
parsed = XML_PARSER.parse(xml)?.Sanctions;
}
// Yield to let GC reclaim the xml string before the build phase
await new Promise((resolve) => setImmediate(resolve));
return new Promise((resolve, reject) => {
// strict=true: case-sensitive tag names. xmlns=false: we strip prefixes manually.
const parser = sax.parser(true, { trim: false, normalize: false });
if (!parsed) throw new Error(`OFAC ${source.label} parse returned no Sanctions root`);
const result = buildEntriesForDocument(parsed, source.label);
console.log(` ${source.label}: ${result.entries.length} entries parsed`);
return result;
// ── Reference maps (built first, small, kept for cross-reference) ──────────
const areaCodes = new Map(); // ID → { code, name }
const featureTypes = new Map(); // ID → label string
const legalBasis = new Map(); // ID → shortRef string
const locations = new Map(); // ID → { codes[], names[] }
const parties = new Map(); // profileId → { name, entityType, countryCodes[], countryNames[] }
const entries = [];
let datasetDate = 0;
let bytesReceived = 0;
// ── Element stack & text buffer ────────────────────────────────────────────
const stack = []; // local element names
let text = ''; // accumulated character data for current leaf
// ── Section flags ──────────────────────────────────────────────────────────
let inDateOfIssue = false;
let inAreaCodeValues = false;
let inFeatureTypeValues = false;
let inLegalBasisValues = false;
let inLocations = false;
let inDistinctParties = false;
let inSanctionsEntries = false;
// ── Current-object accumulators ────────────────────────────────────────────
// DateOfIssue
let doiYear = 0, doiMonth = 1, doiDay = 1;
// AreaCode / FeatureType / LegalBasis (reference value section)
let refId = '', refShortRef = '', refDescription = '';
// Location
let locId = '';
let locAreaCodeIds = null; // string[] | null
// DistinctParty / Profile
let partyFixedRef = '';
let profileId = '', profileSubTypeId = '';
let aliases = null; // Alias[]
let curAlias = null; // { primary, typeId, nameParts[] }
let inDocumentedName = false;
let namePartsBuf = null; // string[] collecting NamePartValue text
let profileFeatures = null; // Feature[]
let curFeature = null; // { featureTypeId, locationIds[] }
// SanctionsEntry
let entryId = '', entryProfileId = '';
let entryDates = null; // number[] (epochs from EntryEvent.Date)
let entryMeasureDates = null; // number[] (from SanctionsMeasure.DatePeriod)
let entryPrograms = null; // string[]
let entryNoteComments = null; // string[] (non-program comments)
let entryLegalIds = null; // string[] (LegalBasisID from EntryEvent)
// Date sub-elements (shared by multiple contexts)
let dateYear = 0, dateMonth = 1, dateDay = 1;
let inEntryEventDate = false;
let inMeasureDatePeriod = false;
// ── Helpers ────────────────────────────────────────────────────────────────
/**
 * Convert year/month/day numbers into a UTC epoch in milliseconds.
 *
 * @param {number} y - Year; any falsy value (0, NaN) means "no date".
 * @param {number} m - 1-based month; values below 1 are treated as 1.
 * @param {number} d - Day of month; values below 1 are treated as 1.
 * @returns {number} `Date.UTC` result, or 0 when `y` is falsy.
 */
function epoch(y, m, d) {
  const monthIndex = Math.max(1, m) - 1;
  const dayOfMonth = Math.max(1, d);
  return y ? Date.UTC(y, monthIndex, dayOfMonth) : 0;
}
/**
 * Resolve the area-code IDs collected for the current Location into
 * parallel code/name arrays.
 *
 * Reads the enclosing `locAreaCodeIds` and `areaCodes` state; the `locId`
 * argument is accepted but not used by the body.
 *
 * @param {string} locId - ID of the location being closed (unused here).
 * @returns {{codes: string[], names: string[]}} Pairs de-duplicated by code,
 *   with empty codes removed, ordered by code.
 */
function resolveLocation(locId) {
  // Key by code so a repeated code keeps one name and the pair stays aligned.
  const nameByCode = new Map();
  for (const areaId of locAreaCodeIds) {
    const area = areaCodes.get(areaId);
    if (area) nameByCode.set(area.code, area.name);
  }

  const ordered = [...nameByCode]
    .filter(([code]) => code.length > 0)
    .sort(([left], [right]) => left.localeCompare(right));

  const codes = [];
  const names = [];
  for (const [code, name] of ordered) {
    codes.push(code);
    names.push(name);
  }
  return { codes, names };
}
/**
 * Summarize the profile that has just been parsed and store it in `parties`
 * under the current `profileId`.
 *
 * Reads the enclosing accumulators (`aliases`, `profileSubTypeId`,
 * `profileFeatures`) and the `featureTypes` / `locations` reference maps.
 * The stored record has `name`, `entityType`, `countryCodes` and
 * `countryNames`.
 */
function finalizeParty() {
  // Alias preference: flagged primary, then type 1403, then first seen.
  const pickAlias = () => {
    const flagged = aliases?.find((alias) => alias.primary);
    if (flagged) return flagged;
    const typed = aliases?.find((alias) => alias.typeId === '1403');
    if (typed) return typed;
    return aliases?.[0];
  };
  const chosen = pickAlias();
  const name = chosen?.nameParts.join(' ') || 'Unnamed designation';

  const labelOf = (feature) => featureTypes.get(feature.featureTypeId) || '';

  let entityType;
  switch (profileSubTypeId) {
    case '1':
      entityType = 'SANCTIONS_ENTITY_TYPE_VESSEL';
      break;
    case '2':
      entityType = 'SANCTIONS_ENTITY_TYPE_AIRCRAFT';
      break;
    default:
      entityType = profileFeatures?.some((feature) => /birth|citizenship|nationality/i.test(labelOf(feature)))
        ? 'SANCTIONS_ENTITY_TYPE_INDIVIDUAL'
        : 'SANCTIONS_ENTITY_TYPE_ENTITY';
  }

  // First name seen for a code wins; keyed by code to keep pairs aligned.
  const nameByCode = new Map();
  for (const feature of profileFeatures ?? []) {
    if (!/location/i.test(labelOf(feature))) continue;
    for (const locationId of feature.locationIds) {
      const resolved = locations.get(locationId);
      if (!resolved) continue;
      resolved.codes.forEach((code, index) => {
        if (code && !nameByCode.has(code)) nameByCode.set(code, resolved.names[index] ?? '');
      });
    }
  }

  const ordered = [...nameByCode].sort(([left], [right]) => left.localeCompare(right));
  parties.set(profileId, {
    name,
    entityType,
    countryCodes: ordered.map(([code]) => code),
    countryNames: ordered.map(([, countryName]) => countryName),
  });
}
/**
 * Emit one output record for the SanctionsEntry that has just been parsed
 * and append it to `entries`.
 *
 * Reads the enclosing per-entry accumulators (`entryId`, `entryProfileId`,
 * `entryPrograms`, `entryDates`, `entryMeasureDates`, `entryNoteComments`,
 * `entryLegalIds`) and joins them with the party stored under
 * `entryProfileId`.
 *
 * - `programs`: collected comments matching `PROGRAM_CODE_RE`, falling back
 *   to the source label when there are none.
 * - `effectiveAt`: the largest collected epoch, as a string ('0' if none).
 * - `note`: first non-empty free-text comment, else the first non-empty
 *   legal-basis label.
 */
function finalizeEntry() {
  const party = parties.get(entryProfileId);
  const name = party?.name || 'Unnamed designation';
  const programs = uniqueSorted((entryPrograms ?? []).filter((c) => PROGRAM_CODE_RE.test(c)));
  const allDates = [...(entryDates ?? []), ...(entryMeasureDates ?? [])];
  const effectiveAt = String(allDates.length > 0 ? Math.max(...allDates) : 0);
  const commentNote = (entryNoteComments ?? []).find((c) => c);
  const legalNote = (entryLegalIds ?? []).map((id) => legalBasis.get(id) || '').find((n) => n) || '';
  const note = compactNote(commentNote || legalNote);

  // Fall back to the name when both ID and ProfileID are missing so the id is
  // never the bare "<label>:" prefix, which would be shared by every such
  // entry. This matches the id rule of the DOM-based implementation.
  const idSuffix = entryId || entryProfileId || name;

  entries.push({
    id: `${source.label}:${idSuffix}`,
    name,
    entityType: party?.entityType || 'SANCTIONS_ENTITY_TYPE_ENTITY',
    countryCodes: party?.countryCodes ?? [],
    countryNames: party?.countryNames ?? [],
    programs: programs.length > 0 ? programs : [source.label],
    sourceLists: [source.label],
    effectiveAt,
    isNew: false,
    note,
  });
}
// ── SAX event handlers ─────────────────────────────────────────────────────
/**
 * SAX open-tag handler.
 *
 * Pushes the element's local name onto `stack`, clears the text buffer, and
 * then, depending on the element, either raises a section flag or starts a
 * fresh accumulator from the element's attributes. Accumulators are only
 * started when the matching section flag (or parent accumulator) is active,
 * so same-named elements in other sections are ignored.
 */
parser.onopentag = (node) => {
  const name = local(node.name);
  const attrs = node.attributes;
  stack.push(name);
  text = '';

  switch (name) {
    // ── Section markers ──
    case 'DateOfIssue': inDateOfIssue = true; break;
    case 'AreaCodeValues': inAreaCodeValues = true; break;
    case 'FeatureTypeValues': inFeatureTypeValues = true; break;
    case 'LegalBasisValues': inLegalBasisValues = true; break;
    case 'Locations': inLocations = true; break;
    case 'DistinctParties': inDistinctParties = true; break;
    case 'SanctionsEntries': inSanctionsEntries = true; break;

    // ── Reference values (text content is captured on close) ──
    case 'AreaCode':
      if (inAreaCodeValues) { refId = attrs.ID || ''; refDescription = attrs.Description || ''; }
      break;
    case 'FeatureType':
      if (inFeatureTypeValues) refId = attrs.ID || '';
      break;
    case 'LegalBasis':
      if (inLegalBasisValues) { refId = attrs.ID || ''; refShortRef = attrs.LegalBasisShortRef || ''; }
      break;

    // ── Locations ──
    case 'Location':
      if (inLocations) { locId = attrs.ID || ''; locAreaCodeIds = []; }
      break;
    case 'LocationAreaCode':
      if (locAreaCodeIds && attrs.AreaCodeID) locAreaCodeIds.push(attrs.AreaCodeID);
      break;

    // ── DistinctParty / Profile ──
    case 'DistinctParty':
      if (inDistinctParties) { partyFixedRef = attrs.FixedRef || ''; aliases = []; profileFeatures = []; }
      break;
    case 'Profile':
      // Profile ID falls back to the enclosing party's FixedRef.
      if (inDistinctParties) { profileId = attrs.ID || partyFixedRef; profileSubTypeId = attrs.PartySubTypeID || ''; }
      break;
    case 'Alias':
      if (inDistinctParties) curAlias = { primary: attrs.Primary === 'true', typeId: attrs.AliasTypeID || '', nameParts: [] };
      break;
    case 'DocumentedName':
      if (curAlias) { inDocumentedName = true; namePartsBuf = []; }
      break;
    case 'Feature':
      if (inDistinctParties) curFeature = { featureTypeId: attrs.FeatureTypeID || '', locationIds: [] };
      break;
    case 'VersionLocation':
      if (curFeature && attrs.LocationID) curFeature.locationIds.push(attrs.LocationID);
      break;

    // ── SanctionsEntry ──
    case 'SanctionsEntry':
      if (inSanctionsEntries) {
        entryId = attrs.ID || ''; entryProfileId = attrs.ProfileID || '';
        entryDates = []; entryMeasureDates = []; entryPrograms = []; entryNoteComments = []; entryLegalIds = [];
      }
      break;
    case 'EntryEvent':
      if (entryDates) {
        inEntryEventDate = true;
        // The legal-basis reference may be carried as an attribute of
        // EntryEvent rather than as a child element. The DOM-based
        // implementation read it from a parser that merged attributes and
        // children under the same key, so accept the attribute form here;
        // the child-element form is still collected by the close handler.
        if (attrs.LegalBasisID) entryLegalIds.push(attrs.LegalBasisID);
      }
      break;
    case 'SanctionsMeasure':
      if (entryDates) inMeasureDatePeriod = false; // reset, set when we see DatePeriod
      break;
    case 'DatePeriod':
      if (entryMeasureDates) inMeasureDatePeriod = true;
      break;
    case 'Date':
    case 'From':
      // Start a fresh date so parts left over from a previous date cannot leak in.
      dateYear = 0; dateMonth = 1; dateDay = 1;
      break;
  }
};
/**
 * SAX close-tag handler.
 *
 * Takes the trimmed text accumulated for the element, pops the element from
 * `stack`, and then either lowers a section flag, records a leaf value, or
 * finalizes the accumulator that the matching open tag started.
 *
 * `Comment` and `From` are only honoured under the parents the DOM-based
 * implementation read them from (`SanctionsMeasure` and `Start`). This keeps
 * EntryEvent comments and DatePeriod end dates out of programs, notes and
 * effectiveAt.
 */
parser.onclosetag = (rawName) => {
  const name = local(rawName);
  const t = text.trim();
  text = '';
  stack.pop();
  // After the pop, the top of the stack is the parent of the element being closed.
  const parent = stack[stack.length - 1];

  switch (name) {
    // ── DateOfIssue ──
    case 'DateOfIssue': inDateOfIssue = false; datasetDate = epoch(doiYear, doiMonth, doiDay); break;

    // ── Shared Year/Month/Day (context determined by flags) ──
    case 'Year':
      if (inDateOfIssue) doiYear = Number(t) || 0;
      else dateYear = Number(t) || 0;
      break;
    case 'Month':
      if (inDateOfIssue) doiMonth = Number(t) || 1;
      else dateMonth = Number(t) || 1;
      break;
    case 'Day':
      if (inDateOfIssue) doiDay = Number(t) || 1;
      else dateDay = Number(t) || 1;
      break;

    // ── Section close ──
    case 'AreaCodeValues': inAreaCodeValues = false; break;
    case 'FeatureTypeValues': inFeatureTypeValues = false; break;
    case 'LegalBasisValues': inLegalBasisValues = false; break;
    case 'Locations': inLocations = false; break;
    case 'DistinctParties': inDistinctParties = false; break;
    case 'SanctionsEntries': inSanctionsEntries = false; break;

    // ── Reference values ──
    case 'AreaCode':
      if (inAreaCodeValues && refId) areaCodes.set(refId, { code: t, name: refDescription });
      break;
    case 'FeatureType':
      if (inFeatureTypeValues && refId) featureTypes.set(refId, t);
      break;
    case 'LegalBasis':
      // Prefer the short reference attribute; fall back to the element text.
      if (inLegalBasisValues && refId) legalBasis.set(refId, refShortRef || t);
      break;

    // ── Locations ──
    case 'Location':
      if (locAreaCodeIds !== null) {
        locations.set(locId, resolveLocation(locId));
        locId = ''; locAreaCodeIds = null;
      }
      break;

    // ── DistinctParty / Profile ──
    case 'NamePartValue':
      if (namePartsBuf !== null && t) namePartsBuf.push(t);
      break;
    case 'DocumentedName':
      if (curAlias && namePartsBuf !== null) { curAlias.nameParts = namePartsBuf; namePartsBuf = null; inDocumentedName = false; }
      break;
    case 'Alias':
      if (curAlias) { aliases.push(curAlias); curAlias = null; }
      break;
    case 'Feature':
      if (curFeature) { profileFeatures.push(curFeature); curFeature = null; }
      break;
    case 'Profile':
      if (inDistinctParties && profileId) finalizeParty();
      profileId = ''; profileSubTypeId = ''; aliases = []; profileFeatures = [];
      break;
    case 'DistinctParty':
      partyFixedRef = '';
      break;

    // ── SanctionsEntry date contexts ──
    case 'Date':
      if (inEntryEventDate && entryDates) {
        const e = epoch(dateYear, dateMonth, dateDay);
        if (e > 0) entryDates.push(e);
      }
      break;
    case 'From':
      // Only DatePeriod/Start/From counts as the measure's start date.
      if (parent === 'Start' && inMeasureDatePeriod && entryMeasureDates) {
        const e = epoch(dateYear, dateMonth, dateDay);
        if (e > 0) entryMeasureDates.push(e);
      }
      break;
    case 'EntryEvent':
      inEntryEventDate = false;
      break;
    case 'SanctionsMeasure':
      inMeasureDatePeriod = false;
      break;
    case 'DatePeriod':
      inMeasureDatePeriod = false;
      break;

    // ── SanctionsEntry leaf data ──
    case 'LegalBasisID':
      if (entryLegalIds) entryLegalIds.push(t);
      break;
    case 'Comment':
      // Only comments that sit directly under a SanctionsMeasure feed
      // programs and notes.
      if (parent === 'SanctionsMeasure') {
        if (entryPrograms !== null) entryPrograms.push(t);
        if (entryNoteComments !== null && t && !PROGRAM_CODE_RE.test(t)) entryNoteComments.push(t);
      }
      break;
    case 'SanctionsEntry':
      if (entryDates !== null) finalizeEntry();
      entryId = ''; entryProfileId = ''; entryDates = null; entryMeasureDates = null;
      entryPrograms = null; entryNoteComments = null; entryLegalIds = null;
      break;
  }
};
// Character data and CDATA are both appended to the shared text buffer; the
// open/close tag handlers reset it, so it holds the text seen since the most
// recent tag boundary.
parser.ontext = (chunk) => { text += chunk; };
parser.oncdata = (chunk) => { text += chunk; };
// Parse errors are treated as warnings: the parser is resumed and the error
// is logged, so the promise still settles through onend with whatever
// entries were collected.
parser.onerror = (err) => {
parser.resume(); // keep streaming; log but don't abort — partial results are valid
console.warn(` ${source.label}: SAX parse warning: ${err.message}`);
};
// End of document: log the streamed size, entry count and elapsed time, then
// resolve the enclosing promise with the collected entries and dataset date.
parser.onend = () => {
console.log(` ${source.label}: ${(bytesReceived / 1024).toFixed(0)}KB streamed, ${entries.length} entries parsed (${Date.now() - t0}ms)`);
resolve({ entries, datasetDate });
};
// Stream response body through the SAX parser chunk by chunk.
// response.body is a web ReadableStream (Node.js 20 native fetch).
// Each chunk is decoded and written to the parser immediately, so the full
// XML text is never assembled in one string.
const decoder = new TextDecoder('utf-8');
(async () => {
try {
for await (const chunk of response.body) {
bytesReceived += chunk.byteLength;
// stream: true lets the decoder carry an incomplete multi-byte sequence
// over to the next chunk instead of emitting a replacement character.
parser.write(decoder.decode(chunk, { stream: true }));
}
// Flush any remaining bytes in the decoder
const tail = decoder.decode();
if (tail) parser.write(tail);
// Signal end of input; the promise is resolved from parser.onend
// (presumably emitted by close() — confirm against the sax docs).
parser.close();
} catch (err) {
// Anything thrown while reading or parsing rejects the enclosing promise.
reject(err);
}
})();
});
}
async function fetchSanctionsPressure() {
@@ -337,8 +466,8 @@ async function fetchSanctionsPressure() {
const hasPrevious = previousIds.size > 0;
console.log(` Previous state: ${hasPrevious ? `${previousIds.size} known IDs` : 'none (first run or expired)'}`);
// Sequential fetch to reduce peak heap: SDN (~120MB XML) then Consolidated.
// Parallel parse would double peak memory and OOM on 512MB Railway containers.
// Sequential fetch: SDN then Consolidated. SAX streaming keeps peak RAM low
// regardless of file size — no full XML string or DOM tree is ever built.
const results = [];
for (const source of OFAC_SOURCES) {
results.push(await fetchSource(source));

View File

@@ -68,47 +68,6 @@ describe('seed: memory safety', () => {
});
});
// ---------------------------------------------------------------------------
// Seed: buildLocationMap must sort code/name as aligned pairs
// ---------------------------------------------------------------------------
describe('seed buildLocationMap: code/name alignment', () => {
it('seed buildLocationMap uses paired sort instead of independent uniqueSorted calls', () => {
const fnStart = seedSrc.indexOf('function buildLocationMap(');
const fnEnd = seedSrc.indexOf('\nfunction extractPartyName(');
const fnBody = seedSrc.slice(fnStart, fnEnd);
assert.match(
fnBody,
/new Map\(mapped\.map/,
'seed buildLocationMap must deduplicate via Map keyed on code',
);
assert.ok(
!fnBody.includes("uniqueSorted(mapped.map((item) => item.code))"),
'seed buildLocationMap must not sort codes independently',
);
assert.ok(
!fnBody.includes("uniqueSorted(mapped.map((item) => item.name))"),
'seed buildLocationMap must not sort names independently',
);
});
it('seed extractPartyCountries deduplicates via Map instead of independent uniqueSorted', () => {
const fnStart = seedSrc.indexOf('function extractPartyCountries(');
const fnEnd = seedSrc.indexOf('\nfunction buildPartyMap(');
const fnBody = seedSrc.slice(fnStart, fnEnd);
assert.match(
fnBody,
/const seen = new Map/,
'seed extractPartyCountries must use a seen Map for deduplication',
);
assert.ok(
!fnBody.includes('uniqueSorted(codes)'),
'seed extractPartyCountries must not sort codes independently',
);
});
});
// ---------------------------------------------------------------------------
// Seed: DEFAULT_RECENT_LIMIT must not exceed handler MAX_ITEMS_LIMIT
// ---------------------------------------------------------------------------

View File

@@ -13,6 +13,9 @@ function normalize(v) {
// Load pure helper functions from the seed script in an isolated vm context.
// This avoids the ESM side-effects (loadEnvFile, runSeed) that fire on import.
// We strip: import lines, loadEnvFile() call, async network functions, runSeed.
// The SAX rewrite replaced all DOM-helper functions (listify, textValue, buildEpoch,
// buildReferenceMaps, buildLocationMap, extractPartyName, etc.) with a streaming
// state machine inside fetchSource. Only pure output-stage helpers remain testable.
// ---------------------------------------------------------------------------
const seedSrc = readFileSync('scripts/seed-sanctions-pressure.mjs', 'utf8');
@@ -21,119 +24,17 @@ const pureSrc = seedSrc
.replace(/loadEnvFile\([^)]+\);/, '')
.replace(/async function fetchSource[\s\S]*/, ''); // remove network + runSeed tail
// Stub XMLParser: only the module-level XML_PARSER constant is constructed at load time;
// the actual parse() method is only called in fetchSource (stripped above).
class XMLParser { parse() { return {}; } }
const ctx = vm.createContext({ console, Date, Math, Number, Array, Map, Set, String, RegExp, XMLParser });
const ctx = vm.createContext({ console, Date, Math, Number, Array, Map, Set, String, RegExp });
vm.runInContext(pureSrc, ctx);
const {
listify,
textValue,
buildEpoch,
uniqueSorted,
compactNote,
extractDocumentedName,
normalizeDateOfIssue,
buildReferenceMaps,
buildLocationMap,
extractPartyName,
resolveEntityType,
extractPartyCountries,
buildPartyMap,
extractPrograms,
extractEffectiveAt,
extractNote,
buildEntriesForDocument,
sortEntries,
buildCountryPressure,
buildProgramPressure,
} = ctx;
// ---------------------------------------------------------------------------
// listify
// ---------------------------------------------------------------------------
describe('listify', () => {
it('wraps a scalar in an array', () => {
assert.deepEqual(normalize(listify('x')), ['x']);
});
it('returns the array as-is', () => {
assert.deepEqual(normalize(listify([1, 2])), [1, 2]);
});
it('returns [] for null', () => {
assert.deepEqual(normalize(listify(null)), []);
});
it('returns [] for undefined', () => {
assert.deepEqual(normalize(listify(undefined)), []);
});
it('wraps a number', () => {
assert.deepEqual(normalize(listify(0)), [0]);
});
});
// ---------------------------------------------------------------------------
// textValue
// ---------------------------------------------------------------------------
describe('textValue', () => {
it('returns empty string for null', () => {
assert.equal(textValue(null), '');
});
it('trims a plain string', () => {
assert.equal(textValue(' hello '), 'hello');
});
it('converts a number', () => {
assert.equal(textValue(42), '42');
});
it('converts a boolean', () => {
assert.equal(textValue(true), 'true');
});
it('extracts #text from an object', () => {
assert.equal(textValue({ '#text': ' inner ' }), 'inner');
});
it('extracts NamePartValue from an object', () => {
assert.equal(textValue({ NamePartValue: ' name ' }), 'name');
});
it('returns empty string for an object with no recognized key', () => {
assert.equal(textValue({ other: 'x' }), '');
});
});
// ---------------------------------------------------------------------------
// buildEpoch
// ---------------------------------------------------------------------------
describe('buildEpoch', () => {
it('returns 0 for null parts', () => {
assert.equal(buildEpoch(null), 0);
});
it('returns 0 when Year is 0', () => {
assert.equal(buildEpoch({ Year: '0', Month: '1', Day: '1' }), 0);
});
it('builds correct UTC epoch', () => {
assert.equal(buildEpoch({ Year: '2023', Month: '6', Day: '15' }), Date.UTC(2023, 5, 15));
});
it('defaults missing Month and Day to 1', () => {
assert.equal(buildEpoch({ Year: '2023' }), Date.UTC(2023, 0, 1));
});
it('clamps Month 0 to 1', () => {
assert.equal(buildEpoch({ Year: '2022', Month: '0', Day: '5' }), Date.UTC(2022, 0, 5));
});
});
// ---------------------------------------------------------------------------
// uniqueSorted
// ---------------------------------------------------------------------------
@@ -180,276 +81,6 @@ describe('compactNote', () => {
});
});
// ---------------------------------------------------------------------------
// extractDocumentedName
// ---------------------------------------------------------------------------
describe('extractDocumentedName', () => {
it('joins multiple DocumentedNamePart values', () => {
const dn = {
DocumentedNamePart: [
{ NamePartValue: 'John' },
{ NamePartValue: 'Doe' },
],
};
assert.equal(extractDocumentedName(dn), 'John Doe');
});
it('falls back to textValue of the whole object when no parts', () => {
assert.equal(extractDocumentedName({ '#text': 'Fallback Name' }), 'Fallback Name');
});
it('returns empty string for null', () => {
assert.equal(extractDocumentedName(null), '');
});
});
// ---------------------------------------------------------------------------
// normalizeDateOfIssue
// ---------------------------------------------------------------------------
describe('normalizeDateOfIssue', () => {
it('returns 0 for null', () => {
assert.equal(normalizeDateOfIssue(null), 0);
});
it('returns correct epoch for valid date parts', () => {
assert.equal(normalizeDateOfIssue({ Year: '2024', Month: '1', Day: '15' }), Date.UTC(2024, 0, 15));
});
});
// ---------------------------------------------------------------------------
// buildReferenceMaps
// ---------------------------------------------------------------------------
// Suite for buildReferenceMaps: checks the three lookup maps it returns
// (areaCodes, featureTypes, legalBasis) against a minimal parsed document.
describe('buildReferenceMaps', () => {
  // One reference value of each kind, assembled into the shared fixture below.
  const areaCodeValues = {
    AreaCode: [{ ID: '10', Description: 'Russia', '#text': 'RU' }],
  };
  const featureTypeValues = {
    FeatureType: [{ ID: '20', '#text': 'Citizenship Country' }],
  };
  const legalBasisValues = {
    LegalBasis: [{ ID: '30', LegalBasisShortRef: 'EO13685' }],
  };
  const doc = {
    ReferenceValueSets: {
      AreaCodeValues: areaCodeValues,
      FeatureTypeValues: featureTypeValues,
      LegalBasisValues: legalBasisValues,
    },
  };

  it('builds areaCodes map keyed by ID', () => {
    const maps = buildReferenceMaps(doc);
    const russia = maps.areaCodes.get('10');
    assert.deepEqual(normalize(russia), { code: 'RU', name: 'Russia' });
  });

  it('builds featureTypes map keyed by ID', () => {
    const maps = buildReferenceMaps(doc);
    assert.equal(maps.featureTypes.get('20'), 'Citizenship Country');
  });

  it('builds legalBasis map using LegalBasisShortRef', () => {
    const maps = buildReferenceMaps(doc);
    assert.equal(maps.legalBasis.get('30'), 'EO13685');
  });

  it('returns empty maps for missing ReferenceValueSets', () => {
    const maps = buildReferenceMaps({});
    // Same order as the individual checks: areaCodes, featureTypes, legalBasis.
    for (const mapName of ['areaCodes', 'featureTypes', 'legalBasis']) {
      assert.equal(maps[mapName].size, 0);
    }
  });
});
// ---------------------------------------------------------------------------
// buildLocationMap
// ---------------------------------------------------------------------------
// Suite for buildLocationMap: a location's area-code references should resolve
// to sorted, de-duplicated code and name lists.
describe('buildLocationMap', () => {
  // Builds a document holding a single Location that references the given area-code IDs.
  const documentWithLocation = (locationId, areaCodeIds) => ({
    Locations: {
      Location: [
        {
          ID: locationId,
          LocationAreaCode: areaCodeIds.map((AreaCodeID) => ({ AreaCodeID })),
        },
      ],
    },
  });

  it('maps location ID to aligned code/name pairs', () => {
    const areaCodes = new Map();
    areaCodes.set('10', { code: 'RU', name: 'Russia' });
    areaCodes.set('11', { code: 'BY', name: 'Belarus' });

    const locationMap = buildLocationMap(documentWithLocation('200', ['10', '11']), areaCodes);
    const resolved = locationMap.get('200');

    assert.deepEqual(normalize(resolved.codes), ['BY', 'RU']); // sorted alpha
    assert.deepEqual(normalize(resolved.names), ['Belarus', 'Russia']);
  });

  it('deduplicates repeated area codes within a location', () => {
    const areaCodes = new Map();
    areaCodes.set('10', { code: 'RU', name: 'Russia' });

    const locationMap = buildLocationMap(documentWithLocation('300', ['10', '10']), areaCodes);

    assert.deepEqual(normalize(locationMap.get('300').codes), ['RU']);
  });
});
// ---------------------------------------------------------------------------
// resolveEntityType
// ---------------------------------------------------------------------------
// Table-driven suite for resolveEntityType. Each case lists the profile, the
// [id, label] pairs used to seed the featureTypes Map, and the expected type.
describe('resolveEntityType', () => {
  const cases = [
    {
      title: 'returns VESSEL for PartySubTypeID 1',
      profile: { PartySubTypeID: '1' },
      featureTypePairs: [],
      expected: 'SANCTIONS_ENTITY_TYPE_VESSEL',
    },
    {
      title: 'returns AIRCRAFT for PartySubTypeID 2',
      profile: { PartySubTypeID: '2' },
      featureTypePairs: [],
      expected: 'SANCTIONS_ENTITY_TYPE_AIRCRAFT',
    },
    {
      title: 'returns INDIVIDUAL when a feature type contains "birth"',
      profile: { Feature: [{ FeatureTypeID: '99' }] },
      featureTypePairs: [['99', 'Date of Birth']],
      expected: 'SANCTIONS_ENTITY_TYPE_INDIVIDUAL',
    },
    {
      title: 'returns INDIVIDUAL when a feature type contains "nationality"',
      profile: { Feature: [{ FeatureTypeID: '88' }] },
      featureTypePairs: [['88', 'Nationality Country']],
      expected: 'SANCTIONS_ENTITY_TYPE_INDIVIDUAL',
    },
    {
      title: 'returns ENTITY for non-individual, non-vessel, non-aircraft',
      profile: { Feature: [{ FeatureTypeID: '77' }] },
      featureTypePairs: [['77', 'Address']],
      expected: 'SANCTIONS_ENTITY_TYPE_ENTITY',
    },
  ];

  for (const { title, profile, featureTypePairs, expected } of cases) {
    it(title, () => {
      const featureTypes = new Map(featureTypePairs);
      assert.equal(resolveEntityType(profile, featureTypes), expected);
    });
  }
});
// ---------------------------------------------------------------------------
// extractPartyName
// ---------------------------------------------------------------------------
// Suite for extractPartyName: primary alias name, first-alias fallback, and
// the empty result for a profile without any identity.
describe('extractPartyName', () => {
  // Wraps the given aliases in a profile with a single Identity.
  const profileWithAliases = (...aliases) => ({
    Identity: [{ Alias: aliases }],
  });

  it('uses primary alias DocumentedName', () => {
    const primaryAlias = {
      Primary: 'true',
      DocumentedName: { DocumentedNamePart: [{ NamePartValue: 'Corp' }, { NamePartValue: 'LLC' }] },
    };
    const name = extractPartyName(profileWithAliases(primaryAlias));
    assert.equal(name, 'Corp LLC');
  });

  it('falls back to first alias when no primary', () => {
    const onlyAlias = { DocumentedName: { '#text': 'Fallback Entity' } };
    const name = extractPartyName(profileWithAliases(onlyAlias));
    assert.equal(name, 'Fallback Entity');
  });

  it('returns empty string when no identity', () => {
    const bareProfile = {};
    assert.equal(extractPartyName(bareProfile), '');
  });
});
// ---------------------------------------------------------------------------
// extractPrograms
// ---------------------------------------------------------------------------
// Table-driven suite for extractPrograms: program codes are read from
// SanctionsMeasure comments, with free text rejected and duplicates collapsed.
describe('extractPrograms', () => {
  // Builds an entry whose SanctionsMeasure list carries one Comment per argument.
  const entryWithComments = (...comments) => ({
    SanctionsMeasure: comments.map((Comment) => ({ Comment })),
  });

  const cases = [
    {
      title: 'extracts valid program codes from SanctionsMeasure comments',
      makeEntry: () => entryWithComments('UKRAINE-EO13685', 'RUSSIA-EO14024'),
      expected: ['RUSSIA-EO14024', 'UKRAINE-EO13685'], // sorted
    },
    {
      title: 'excludes free-text comments that fail the program code regex',
      makeEntry: () => entryWithComments('Blocked for human rights violations'),
      expected: [],
    },
    {
      title: 'deduplicates program codes',
      makeEntry: () => entryWithComments('IRAN', 'IRAN'),
      expected: ['IRAN'],
    },
    {
      title: 'returns empty array for empty entry',
      makeEntry: () => ({}),
      expected: [],
    },
  ];

  for (const { title, makeEntry, expected } of cases) {
    it(title, () => {
      const programs = extractPrograms(makeEntry());
      assert.deepEqual(normalize(programs), expected);
    });
  }
});
// ---------------------------------------------------------------------------
// extractEffectiveAt
// ---------------------------------------------------------------------------
// Suite for extractEffectiveAt: the latest date wins across EntryEvent dates
// and SanctionsMeasure date periods, and an entry with no dates yields 0.
describe('extractEffectiveAt', () => {
  // Date parts in the string form the fixtures use.
  const dateParts = (Year, Month, Day) => ({ Year, Month, Day });

  it('returns max epoch across EntryEvent dates', () => {
    const eventDates = [dateParts('2020', '1', '1'), dateParts('2022', '6', '15')];
    const entry = {
      EntryEvent: eventDates.map((when) => ({ Date: when })),
    };
    const latest = Date.UTC(2022, 5, 15);
    assert.equal(extractEffectiveAt(entry), latest);
  });

  it('also considers SanctionsMeasure DatePeriod', () => {
    const measure = {
      DatePeriod: { Start: { From: dateParts('2023', '3', '1') } },
    };
    const entry = {
      EntryEvent: [{ Date: dateParts('2021', '1', '1') }],
      SanctionsMeasure: [measure],
    };
    const latest = Date.UTC(2023, 2, 1);
    assert.equal(extractEffectiveAt(entry), latest);
  });

  it('returns 0 when no dates are present', () => {
    const undatedEntry = {};
    assert.equal(extractEffectiveAt(undatedEntry), 0);
  });
});
// ---------------------------------------------------------------------------
// extractNote
// ---------------------------------------------------------------------------
// Suite for extractNote: a free-text measure comment is preferred, the legal
// basis short ref is the fallback, and an empty entry gives an empty string.
describe('extractNote', () => {
  // Fresh legal-basis lookup per test so no state is shared between cases.
  const makeLegalBasis = () => new Map([['1', 'EO13661']]);

  // Entry with a single measure comment and one event pointing at legal basis '1'.
  const entryWithComment = (Comment) => ({
    SanctionsMeasure: [{ Comment }],
    EntryEvent: [{ LegalBasisID: '1' }],
  });

  it('prefers free-text SanctionsMeasure comment over legal basis', () => {
    const note = extractNote(entryWithComment('Involved in arms trafficking'), makeLegalBasis());
    assert.equal(note, 'Involved in arms trafficking');
  });

  it('falls back to legal basis short ref when comment is a program code', () => {
    // 'IRAN' is a valid program code — filtered out
    const note = extractNote(entryWithComment('IRAN'), makeLegalBasis());
    assert.equal(note, 'EO13661');
  });

  it('returns empty string when nothing available', () => {
    const note = extractNote({}, new Map());
    assert.equal(note, '');
  });
});
// ---------------------------------------------------------------------------
// sortEntries
// ---------------------------------------------------------------------------
@@ -545,129 +176,3 @@ describe('buildProgramPressure', () => {
});
});
// ---------------------------------------------------------------------------
// buildEntriesForDocument — integration
// ---------------------------------------------------------------------------
// Integration suite for buildEntriesForDocument: a minimal parsed document
// (reference values, one location, one party, one sanctions entry) is run
// through the builder and each output field is checked in its own test.
describe('buildEntriesForDocument', () => {
  // --- Fixture pieces, assembled into `doc` below ---------------------------
  const referenceValueSets = {
    AreaCodeValues: {
      AreaCode: [{ ID: '10', Description: 'Russia', '#text': 'RU' }],
    },
    FeatureTypeValues: {
      FeatureType: [{ ID: '20', '#text': 'Registered Location' }],
    },
    LegalBasisValues: {
      LegalBasis: [{ ID: '30', LegalBasisShortRef: 'EO13685' }],
    },
  };

  const acmeParty = {
    FixedRef: '1001',
    Profile: {
      ID: '1001',
      PartySubTypeID: '4',
      Identity: [{
        Alias: [{
          Primary: 'true',
          DocumentedName: {
            DocumentedNamePart: [{ NamePartValue: 'Acme' }, { NamePartValue: 'Corp' }],
          },
        }],
      }],
      Feature: [{
        FeatureTypeID: '20',
        FeatureVersion: [{ VersionLocation: [{ LocationID: '200' }] }],
      }],
    },
  };

  const acmeSanctionsEntry = {
    ID: '5001',
    ProfileID: '1001',
    EntryEvent: [{ Date: { Year: '2022', Month: '3', Day: '1' }, LegalBasisID: '30' }],
    SanctionsMeasure: [{ Comment: 'UKRAINE-EO13685' }],
  };

  const doc = {
    DateOfIssue: { Year: '2024', Month: '1', Day: '15' },
    ReferenceValueSets: referenceValueSets,
    Locations: {
      Location: [{ ID: '200', LocationAreaCode: [{ AreaCodeID: '10' }] }],
    },
    DistinctParties: {
      DistinctParty: [acmeParty],
    },
    SanctionsEntries: {
      SanctionsEntry: [acmeSanctionsEntry],
    },
  };

  // --- Helpers --------------------------------------------------------------
  // Copy of `doc` with its SanctionsEntries section swapped out.
  const docWithSanctionsEntries = (SanctionsEntries) => ({ ...doc, SanctionsEntries });

  // Runs the builder and returns only the entries array.
  const entriesFor = (document, sourceLabel) =>
    buildEntriesForDocument(document, sourceLabel).entries;

  // --- Tests ----------------------------------------------------------------
  it('produces one entry with correct id', () => {
    const entries = entriesFor(doc, 'SDN');
    assert.equal(entries.length, 1);
    assert.equal(entries[0].id, 'SDN:5001');
  });

  it('resolves party name from DistinctParties', () => {
    const entries = entriesFor(doc, 'SDN');
    assert.equal(entries[0].name, 'Acme Corp');
  });

  it('resolves country codes and names from features', () => {
    const entries = entriesFor(doc, 'SDN');
    assert.deepEqual(normalize(entries[0].countryCodes), ['RU']);
    assert.deepEqual(normalize(entries[0].countryNames), ['Russia']);
  });

  it('resolves programs from SanctionsMeasure', () => {
    const entries = entriesFor(doc, 'SDN');
    assert.deepEqual(normalize(entries[0].programs), ['UKRAINE-EO13685']);
  });

  it('sets effectiveAt from EntryEvent date', () => {
    const entries = entriesFor(doc, 'SDN');
    // effectiveAt is compared as a string, so the expected epoch is stringified.
    const expected = String(Date.UTC(2022, 2, 1));
    assert.equal(entries[0].effectiveAt, expected);
  });

  it('sets isNew to false by default', () => {
    const entries = entriesFor(doc, 'SDN');
    assert.equal(entries[0].isNew, false);
  });

  it('returns correct datasetDate', () => {
    const result = buildEntriesForDocument(doc, 'SDN');
    assert.equal(result.datasetDate, Date.UTC(2024, 0, 15));
  });

  it('falls back to sourceLabel as program when no valid program codes', () => {
    const docNoProgram = docWithSanctionsEntries({
      SanctionsEntry: [{
        ID: '5002',
        ProfileID: '1001',
        EntryEvent: [],
        SanctionsMeasure: [{ Comment: 'Suspected money laundering' }],
      }],
    });
    const entries = entriesFor(docNoProgram, 'SDN');
    assert.deepEqual(normalize(entries[0].programs), ['SDN']);
  });

  it('sets sourceLists to [sourceLabel]', () => {
    const entries = entriesFor(doc, 'CONSOLIDATED');
    assert.deepEqual(normalize(entries[0].sourceLists), ['CONSOLIDATED']);
  });

  it('handles empty SanctionsEntries gracefully', () => {
    const emptyDoc = docWithSanctionsEntries({});
    const entries = entriesFor(emptyDoc, 'SDN');
    assert.equal(entries.length, 0);
  });

  it('uses Unnamed designation when party not found', () => {
    // ProfileID '9999' matches no DistinctParty in the fixture.
    const docNoParty = docWithSanctionsEntries({
      SanctionsEntry: [{ ID: '9999', ProfileID: '9999', EntryEvent: [], SanctionsMeasure: [] }],
    });
    const entries = entriesFor(docNoParty, 'SDN');
    assert.equal(entries[0].name, 'Unnamed designation');
  });
});