feat(energy): IEA/OPEC energy intelligence RSS feed (#2713)

* feat(seeds): IEA and OPEC energy intelligence RSS feed

- Add scripts/seed-energy-intelligence.mjs: parses IEA news, IEA reports,
  and OPEC press RSS feeds; filters by 20 energy keywords; deduplicates by
  URL (keeps most recent); excludes items older than 30 days; limits to 30
  most recent items; TTL 86400s (24h); validates >= 3 items
- Add tests/energy-intelligence-seed.test.mjs: 10 tests covering parseRssItems,
  filterEnergyRelevant, deduplicateByUrl, age filter, and key constants
- Add ENERGY_INTELLIGENCE_KEY to server/_shared/cache-keys.ts
- Add energyIntelligence to BOOTSTRAP_KEYS and SEED_META in api/health.js
- Add seed-energy-intelligence service override to railway-set-watch-paths.mjs
  with 6h cron schedule

* fix(seeds): replace dead IEA RSS (404) with OilPrice.com; keep OPEC best-effort

* fix(seeds): fix energyIntelligence health placement, key format, validate export, entity decoding

P1: energyIntelligence was in health.js BOOTSTRAP_KEYS but absent from
api/bootstrap.js and BOOTSTRAP_CACHE_KEYS. The feed has no SPA consumer
(server-side read only via chat-analyst-context), so it belongs in
STANDALONE_KEYS in health.js, not BOOTSTRAP_KEYS. Moved accordingly:
health monitoring is preserved via SEED_META, and the bootstrap test
invariants (every bootstrap key must have a getHydratedData consumer)
are satisfied.

Key format: CANONICAL_KEY renamed energy:intelligence:v1:feed →
energy:intelligence:feed:v1 to comply with the :v\d+$ convention enforced
by bootstrap.test.mjs. Updated in health.js, cache-keys.ts (standalone
export), and seed-energy-intelligence.mjs.

P2: export validate() from seed-energy-intelligence.mjs and add tests
covering the skip path (< 3 items → false, exactly 3 → true, > 3 → true).
OPEC is best-effort and OilPrice is primary, so sub-threshold runs are a
real production scenario.

Quality: expand decodeHtmlEntities to handle numeric decimal/hex character
references (&#8217; &#x2019;) and common named entities (&apos; &hellip;
&mdash; &ndash; &lsquo; &rsquo; &ldquo; &rdquo;). &amp; decoded last to
handle double-encoded sequences correctly. Five new tests added.

* fix(seeds): remove unused extendExistingTtl import from seed-energy-intelligence
This commit is contained in:
Elie Habib
2026-04-05 13:26:19 +04:00
committed by GitHub
parent 4ac51c6f32
commit 066712e859
5 changed files with 455 additions and 0 deletions

View File

@@ -137,6 +137,7 @@ const STANDALONE_KEYS = {
productCatalog: 'product-catalog:v2',
energyExposure: 'energy:exposure:v1:index',
regulatoryActions: 'regulatory:actions:v1',
energyIntelligence: 'energy:intelligence:feed:v1',
};
const SEED_META = {
@@ -256,6 +257,7 @@ const SEED_META = {
regulatoryActions: { key: 'seed-meta:regulatory:actions', maxStaleMin: 360 }, // 2h cron; 360min = 3x interval
electricityPrices: { key: 'seed-meta:energy:electricity-prices', maxStaleMin: 2880 }, // daily cron (14:00 UTC); 2880min = 48h = 2x interval
gasStorageCountries: { key: 'seed-meta:energy:gas-storage-countries', maxStaleMin: 2880 }, // daily cron at 10:30 UTC; 2880min = 48h = 2x interval
energyIntelligence: { key: 'seed-meta:energy:intelligence', maxStaleMin: 720 }, // 6h cron; 720min = 2x interval
};
// Standalone keys that are populated on-demand by RPC handlers (not seeds).

View File

@@ -81,6 +81,15 @@ const SERVICE_OVERRIDES = {
startCommand: 'node seed-gas-storage-countries.mjs',
cronSchedule: '30 10 * * *',
},
'seed-energy-intelligence': {
watchPatterns: [
'scripts/seed-energy-intelligence.mjs',
'scripts/_seed-utils.mjs',
'scripts/package.json',
],
startCommand: 'node seed-energy-intelligence.mjs',
cronSchedule: '0 */6 * * *',
},
};
function getToken() {

View File

@@ -0,0 +1,202 @@
#!/usr/bin/env node
import { loadEnvFile, CHROME_UA, runSeed } from './_seed-utils.mjs';
loadEnvFile(import.meta.url);
// Cache key for the assembled feed payload; handed to runSeed in the entry-point
// guard at the bottom of this file and re-exported as ENERGY_INTELLIGENCE_KEY.
export const CANONICAL_KEY = 'energy:intelligence:feed:v1';
// Lifetime of the cached payload, passed to runSeed as ttlSeconds.
export const INTELLIGENCE_TTL_SECONDS = 86400; // 24h = 4× 6h interval (gold standard: TTL ≥ 3× interval)
// Maximum number of items kept after sorting newest-first.
const MAX_ITEMS = 30;
// Feed bodies longer than this are truncated before parsing. The comparison uses
// String#length, so the limit is counted in UTF-16 code units rather than bytes.
const RSS_MAX_BYTES = 500_000;
// Items whose publishedAt is further in the past than this are dropped.
const AGE_LIMIT_MS = 30 * 24 * 3600 * 1000; // 30 days
// Note: IEA removed public RSS feeds (https://www.iea.org/rss/*.xml returns 404).
// OPEC RSS is Cloudflare-protected — kept as best-effort (works from Railway IPs).
// OilPrice.com provides reliable energy intelligence coverage as primary source.
// Each entry: `url` is fetched, `source` is stamped on every parsed item, and
// `label` prefixes the log lines for that feed.
const FEEDS = [
{ url: 'https://oilprice.com/rss/main', source: 'OilPrice', label: 'oilprice-main' },
{ url: 'https://www.opec.org/opec_web/en/press_room/rss.htm', source: 'OPEC', label: 'opec-press' },
];
// Lower-case keywords used by filterEnergyRelevant. Matching is a plain substring
// test against the lower-cased "title summary" text, so 'oil' also matches words
// that merely contain it.
export const ENERGY_KEYWORDS = [
'oil', 'gas', 'lng', 'coal', 'energy', 'opec', 'refinery', 'petroleum',
'electricity', 'power', 'renewable', 'nuclear', 'barrel', 'crude',
'storage', 'pipeline', 'fuel', 'carbon', 'emissions',
];
/**
 * Deterministic base-31 polynomial hash over the UTF-16 code units of `str`,
 * wrapped to a signed 32-bit integer on every step.
 *
 * @param {string} str - Text to hash.
 * @returns {string} Absolute value of the hash, rendered in base 36.
 */
export function stableHash(str) {
  let acc = 0;
  let idx = 0;
  while (idx < str.length) {
    // Math.imul plus `| 0` keeps the running value in int32 range.
    acc = (Math.imul(31, acc) + str.charCodeAt(idx)) | 0;
    idx += 1;
  }
  const magnitude = acc < 0 ? -acc : acc;
  return magnitude.toString(36);
}
// Named entities this seed recognises, keyed without the leading "&" and the
// trailing ";". Curly quotes written as named entities are normalised to their
// ASCII equivalents; numeric references keep whatever character they encode.
const NAMED_HTML_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
  ['hellip', '…'],
  ['mdash', '—'],
  ['ndash', '–'],
  ['lsquo', "'"],
  ['rsquo', "'"],
  ['ldquo', '"'],
  ['rdquo', '"'],
]);

// Largest value String.fromCodePoint accepts; anything above throws RangeError.
const MAX_UNICODE_CODE_POINT = 0x10ffff;

/**
 * Decode numeric (decimal and hex) character references and a small set of
 * named HTML entities.
 *
 * The text is scanned exactly once, so every entity is decoded a single level:
 * a double-encoded sequence such as `&amp;lt;` becomes `&lt;`, never `<`.
 * Unknown named entities and numeric references outside the Unicode range are
 * left untouched instead of throwing.
 *
 * @param {string} text - Raw text that may contain entities.
 * @returns {string} The text with recognised entities replaced.
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(?:#[xX]([0-9a-fA-F]+)|#(\d+)|([a-zA-Z]+));/g, (entity, hex, dec, name) => {
    if (name !== undefined) {
      // A Map lookup avoids hitting Object.prototype members (e.g. "&constructor;").
      return NAMED_HTML_ENTITIES.get(name) ?? entity;
    }
    const codePoint = hex !== undefined ? Number.parseInt(hex, 16) : Number.parseInt(dec, 10);
    return codePoint <= MAX_UNICODE_CODE_POINT ? String.fromCodePoint(codePoint) : entity;
  });
}
/**
 * Return the trimmed text content of the first `<tagName>…</tagName>` element
 * in `block`, unwrapping a CDATA section when present.
 *
 * Matching is case-insensitive. The opening tag must be exactly `tagName`
 * (optionally followed by attributes), so `title` does not match `<titles>`.
 * Whitespace between the tags and a CDATA marker is tolerated, which keeps
 * pretty-printed feeds from leaking a literal `<![CDATA[` into the value.
 *
 * @param {string} block - XML fragment to search.
 * @param {string} tagName - Element name, e.g. `title` or `dc:date`.
 * @returns {string} The element text, or '' when the element is absent or empty.
 */
function extractTag(block, tagName) {
  // Escape regex metacharacters so the tag name is always matched literally.
  const name = tagName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const re = new RegExp(
    `<${name}(?:\\s[^>]*)?>\\s*(?:<!\\[CDATA\\[)?([\\s\\S]*?)(?:\\]\\]>)?\\s*<\\/${name}\\s*>`,
    'i',
  );
  return (block.match(re) || [])[1]?.trim() || '';
}
/**
 * Turn a raw feed description into a short plain-text summary: decode
 * entities, replace markup tags with spaces, collapse whitespace, and cap the
 * result at 300 characters.
 *
 * @param {string} raw - Raw description/summary text from the feed.
 * @returns {string} Plain-text summary, at most 300 characters long.
 */
function cleanSummary(raw) {
  const decoded = decodeHtmlEntities(raw);
  const withoutTags = decoded.replace(/<[^>]+>/g, ' ');
  const collapsed = withoutTags.replace(/\s+/g, ' ').trim();
  return collapsed.slice(0, 300);
}
/**
 * Read the publication timestamp of a feed item.
 *
 * The first non-empty value among `pubDate`, `published`, `updated` and
 * `dc:date` (in that order) is parsed with the Date constructor.
 *
 * @param {string} block - XML of a single item/entry.
 * @returns {number} Epoch milliseconds, or 0 when no date tag is present or
 *   the value cannot be parsed.
 */
function parseDateMs(block) {
  const dateTags = ['pubDate', 'published', 'updated', 'dc:date'];
  let raw = '';
  for (const tag of dateTags) {
    raw = extractTag(block, tag);
    if (raw) break;
  }
  if (!raw) return 0;

  const timestamp = new Date(raw).getTime();
  if (!Number.isFinite(timestamp)) return 0;
  return timestamp;
}
/**
 * Resolve the URL of a feed item.
 *
 * Prefers the text content of a `<link>` element; when that is empty, falls
 * back to the `href` attribute of the first `<link …>` tag.
 *
 * @param {string} block - XML of a single item/entry.
 * @returns {string} Entity-decoded, trimmed URL, or '' when none is found.
 */
function extractLink(block) {
  const inlineText = extractTag(block, 'link');
  if (inlineText) {
    return decodeHtmlEntities(inlineText).trim();
  }

  const hrefMatch = block.match(/<link[^>]*\bhref=(["'])(.*?)\1[^>]*\/?>/i);
  const hrefValue = (hrefMatch && hrefMatch[2]) || '';
  return decodeHtmlEntities(hrefValue).trim();
}
/**
 * Parse RSS `<item>` and Atom `<entry>` elements out of a feed document.
 *
 * Input longer than RSS_MAX_BYTES is truncated first. Elements lacking a
 * title, a link or a parseable date are skipped, as are elements whose
 * generated id was already emitted. RSS items come first in the result,
 * followed by Atom entries, each in document order.
 *
 * @param {string} xml - Raw feed body.
 * @param {string} source - Source name stamped on every item; its lower-cased
 *   form prefixes the item id.
 * @returns {Array<{id: string, title: string, url: string, source: string,
 *   publishedAt: number, summary: string}>} Parsed items.
 */
export function parseRssItems(xml, source) {
  const text = xml.length > RSS_MAX_BYTES ? xml.slice(0, RSS_MAX_BYTES) : xml;
  const parsed = [];
  const emittedIds = new Set();

  // Walk every element matched by `pattern`; `summaryTags` lists the child
  // tags to try, in order, for the summary text.
  const collect = (pattern, summaryTags) => {
    for (const [, block] of text.matchAll(pattern)) {
      const title = decodeHtmlEntities(extractTag(block, 'title'));
      const url = extractLink(block);
      const publishedAt = parseDateMs(block);
      if (!title || !url || !publishedAt) continue;

      const id = `${source.toLowerCase()}-${stableHash(url)}-${publishedAt}`;
      if (emittedIds.has(id)) continue;
      emittedIds.add(id);

      let rawSummary = '';
      for (const tag of summaryTags) {
        const candidate = extractTag(block, tag);
        if (candidate) {
          rawSummary = candidate;
          break;
        }
      }

      parsed.push({ id, title, url, source, publishedAt, summary: cleanSummary(rawSummary) });
    }
  };

  collect(/<item\b[^>]*>([\s\S]*?)<\/item>/gi, ['description', 'summary', 'content:encoded']);
  collect(/<entry\b[^>]*>([\s\S]*?)<\/entry>/gi, ['summary', 'content']);
  return parsed;
}
/**
 * Keep only the items whose title or summary mentions an energy keyword.
 *
 * The comparison is case-insensitive and substring-based against
 * ENERGY_KEYWORDS.
 *
 * @param {Array<{title: string, summary: string}>} items - Candidate items.
 * @returns {Array} New array with the matching items, in their original order.
 */
export function filterEnergyRelevant(items) {
  const mentionsEnergy = ({ title, summary }) => {
    const haystack = `${title} ${summary}`.toLowerCase();
    for (const keyword of ENERGY_KEYWORDS) {
      if (haystack.includes(keyword)) return true;
    }
    return false;
  };
  return items.filter(mentionsEnergy);
}
/**
 * Collapse items that share a URL, keeping the one with the greatest
 * `publishedAt`. On a tie the item seen first is kept.
 *
 * Items are keyed by the URL string itself rather than by a short hash of it,
 * so two different URLs can never be merged because of a hash collision.
 *
 * @param {Array<{url: string, publishedAt: number}>} items - Items to dedupe.
 * @returns {Array} One item per distinct URL, ordered by each URL's first
 *   appearance in `items`.
 */
export function deduplicateByUrl(items) {
  const newestByUrl = new Map();
  for (const item of items) {
    const kept = newestByUrl.get(item.url);
    if (!kept || item.publishedAt > kept.publishedAt) {
      newestByUrl.set(item.url, item);
    }
  }
  return [...newestByUrl.values()];
}
/**
 * Download one feed and parse it into items.
 *
 * Best-effort: a non-2xx response or any thrown error (network failure, the
 * 15 s timeout, a parse exception) is logged as a warning and yields an empty
 * list, so one failing feed does not fail the whole run.
 *
 * @param {{url: string, source: string, label: string}} feed - Feed descriptor.
 * @returns {Promise<Array>} Parsed items, or [] on any failure.
 */
async function fetchFeed(feed) {
  try {
    const { url, source, label } = feed;
    const requestHeaders = {
      Accept: 'application/rss+xml, application/xml, text/xml, */*',
      'User-Agent': CHROME_UA,
    };
    const resp = await fetch(url, {
      headers: requestHeaders,
      signal: AbortSignal.timeout(15_000),
    });
    if (resp.ok) {
      const body = await resp.text();
      const parsed = parseRssItems(body, source);
      console.log(`[EnergyIntel] ${label}: ${parsed.length} raw items`);
      return parsed;
    }
    console.warn(`[EnergyIntel] ${label} HTTP ${resp.status}`);
    return [];
  } catch (e) {
    console.warn(`[EnergyIntel] ${feed.label} fetch error:`, e?.message || e);
    return [];
  }
}
/**
 * Build the feed payload: fetch every configured feed, then keep only the
 * recent, energy-relevant, URL-unique items, newest first, capped at
 * MAX_ITEMS.
 *
 * @returns {Promise<{items: Array, fetchedAt: number, count: number}>}
 *   The final items, the timestamp used for the age cutoff, and the item count.
 * @throws {Error} When no feed produced any item at all.
 */
async function fetchEnergyIntelligence() {
  const outcomes = await Promise.allSettled(FEEDS.map(fetchFeed));
  const allItems = outcomes.flatMap((outcome) => (outcome.status === 'fulfilled' ? outcome.value : []));
  if (allItems.length === 0) {
    throw new Error('All energy intelligence feeds returned 0 items');
  }

  const now = Date.now();
  const cutoff = now - AGE_LIMIT_MS;
  const recent = allItems.filter(({ publishedAt }) => publishedAt >= cutoff);
  const relevant = filterEnergyRelevant(recent);
  const deduped = deduplicateByUrl(relevant);
  deduped.sort((left, right) => right.publishedAt - left.publishedAt);
  const limited = deduped.slice(0, MAX_ITEMS);

  console.log(`[EnergyIntel] ${allItems.length} raw → ${recent.length} recent → ${relevant.length} relevant → ${deduped.length} deduped → ${limited.length} final`);
  return { items: limited, fetchedAt: now, count: limited.length };
}
/**
 * Gate passed to runSeed as `validateFn`: the payload is acceptable only when
 * it carries an `items` array with at least three entries.
 *
 * @param {{items?: unknown}|null|undefined} data - Payload to check.
 * @returns {boolean} True when `data.items` is an array of length >= 3.
 */
export function validate(data) {
  const items = data?.items;
  if (!Array.isArray(items)) return false;
  return items.length >= 3;
}
export { CANONICAL_KEY as ENERGY_INTELLIGENCE_KEY };
// Run the seed only when this file is the script being executed, so importing
// it (e.g. from tests) has no side effects beyond loadEnvFile above.
if (process.argv[1]?.endsWith('seed-energy-intelligence.mjs')) {
  const seedOptions = {
    validateFn: validate,
    ttlSeconds: INTELLIGENCE_TTL_SECONDS,
    sourceVersion: 'energy-intel-rss-v1',
    recordCount: (data) => data?.items?.length || 0,
  };

  // Log the failure (including its cause, when one is attached) and exit non-zero.
  const reportFatal = (err) => {
    const causeSuffix = err.cause ? ` (cause: ${err.cause.message || err.cause.code || err.cause})` : '';
    console.error('FATAL:', (err.message || err) + causeSuffix);
    process.exit(1);
  };

  runSeed('energy', 'intelligence', CANONICAL_KEY, fetchEnergyIntelligence, seedOptions).catch(reportFatal);
}

View File

@@ -56,6 +56,7 @@ export const GAS_STORAGE_KEY_PREFIX = 'energy:gas-storage:v1:';
export const GAS_STORAGE_COUNTRIES_KEY = 'energy:gas-storage:v1:_countries';
export const SPR_KEY = 'economic:spr:v1';
export const REFINERY_INPUTS_KEY = 'economic:refinery-inputs:v1';
export const ENERGY_INTELLIGENCE_KEY = 'energy:intelligence:feed:v1';
/**
* Static cache keys for the bootstrap endpoint.
@@ -141,6 +142,7 @@ export const BOOTSTRAP_CACHE_KEYS: Record<string, string> = {
electricityPrices: 'energy:electricity:v1:index',
};
export const BOOTSTRAP_TIERS: Record<string, 'slow' | 'fast'> = {
bisPolicy: 'slow', bisExchange: 'slow', bisCredit: 'slow',
minerals: 'slow', giving: 'slow', sectors: 'slow',

View File

@@ -0,0 +1,240 @@
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import {
parseRssItems,
filterEnergyRelevant,
deduplicateByUrl,
validate,
ENERGY_INTELLIGENCE_KEY,
INTELLIGENCE_TTL_SECONDS,
} from '../scripts/seed-energy-intelligence.mjs';
// ---------------------------------------------------------------------------
// Fixtures
// ---------------------------------------------------------------------------
// Two plain-text RSS 2.0 items. Each pubDate's weekday name matches its
// calendar date (5 Apr 2026 is a Sunday), so the fixture is a valid RFC 822
// date for any parser, including ones that check the weekday.
const MINIMAL_RSS = `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<item>
<title>IEA warns of tight LNG supply heading into summer 2026</title>
<link>https://www.iea.org/news/iea-warns-lng-supply-2026</link>
<pubDate>Sun, 05 Apr 2026 10:00:00 +0000</pubDate>
<description>The International Energy Agency said global LNG markets are tightening.</description>
</item>
<item>
<title>OPEC maintains production cuts amid oil demand uncertainty</title>
<link>https://www.opec.org/news/opec-production-cuts</link>
<pubDate>Sat, 04 Apr 2026 08:00:00 +0000</pubDate>
<description>OPEC members agreed to maintain current crude oil production quotas.</description>
</item>
</channel>
</rss>`;
// One item whose title and description are CDATA-wrapped; the description
// carries inline markup that the summary cleaner is expected to strip.
const CDATA_RSS = `<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>CDATA Feed</title>
<item>
<title><![CDATA[IEA Report: Global Energy Review 2026 & Oil Market Forecast]]></title>
<link>https://www.iea.org/reports/global-energy-review-2026</link>
<pubDate>Fri, 03 Apr 2026 12:00:00 +0000</pubDate>
<description><![CDATA[A comprehensive overview of the global energy market with <strong>oil</strong> and <em>gas</em> trends.]]></description>
</item>
</channel>
</rss>`;
// ---------------------------------------------------------------------------
// parseRssItems
// ---------------------------------------------------------------------------
// Parsing of plain and CDATA-wrapped RSS items from the fixtures above.
describe('parseRssItems', () => {
  it('extracts title, url, publishedAt from a minimal RSS XML fixture', () => {
    const parsed = parseRssItems(MINIMAL_RSS, 'IEA');
    assert.equal(parsed.length, 2);

    const [{ title, url, publishedAt, source }] = parsed;
    assert.equal(title, 'IEA warns of tight LNG supply heading into summer 2026');
    assert.equal(url, 'https://www.iea.org/news/iea-warns-lng-supply-2026');
    const hasPositiveTimestamp = typeof publishedAt === 'number' && publishedAt > 0;
    assert.ok(hasPositiveTimestamp, 'publishedAt should be a positive number');
    assert.equal(source, 'IEA');
  });

  it('handles CDATA-wrapped titles', () => {
    const parsed = parseRssItems(CDATA_RSS, 'IEA');
    assert.equal(parsed.length, 1);

    const [only] = parsed;
    assert.equal(only.title, 'IEA Report: Global Energy Review 2026 & Oil Market Forecast');
    assert.ok(only.summary.length > 0);
  });
});
// ---------------------------------------------------------------------------
// filterEnergyRelevant
// ---------------------------------------------------------------------------
// Keyword filtering: title match, case-insensitivity, and summary-only match.
describe('filterEnergyRelevant', () => {
  // Build a feed item whose URL is derived from its id.
  const article = (id, title, summary = '') => ({
    id,
    title,
    url: `https://example.com/${id}`,
    source: 'IEA',
    publishedAt: Date.now(),
    summary,
  });

  it("keeps items with 'oil' in title, drops items with no energy keywords", () => {
    const kept = filterEnergyRelevant([
      article('1', 'Oil prices surge on OPEC cuts'),
      article('2', 'Latest sports results from the weekend', 'Football match highlights and scores.'),
      article('3', 'Tech startup raises funding round', 'Silicon Valley venture capital news.'),
    ]);
    assert.equal(kept.length, 1);
    assert.equal(kept[0].id, '1');
  });

  it("is case-insensitive — 'LNG' in title matches 'lng' keyword", () => {
    const kept = filterEnergyRelevant([article('1', 'LNG exports hit record highs in Q1 2026')]);
    assert.equal(kept.length, 1);
  });

  it('matches keyword in summary when title has no keyword', () => {
    const kept = filterEnergyRelevant([
      article('1', 'Market update for April', 'Crude oil inventories fell sharply last week.'),
    ]);
    assert.equal(kept.length, 1);
  });
});
// ---------------------------------------------------------------------------
// deduplicateByUrl
// ---------------------------------------------------------------------------
// URL de-duplication: the newest duplicate wins and distinct URLs are untouched.
describe('deduplicateByUrl', () => {
  // Build a feed item with an empty summary and the fixed 'IEA' source.
  const story = (id, title, url, publishedAt) => ({ id, title, url, source: 'IEA', publishedAt, summary: '' });

  it('same URL appears only once, keeping the most recent by publishedAt', () => {
    const sharedUrl = 'https://www.iea.org/news/duplicate-story';
    const result = deduplicateByUrl([
      story('a', 'Old version', sharedUrl, 1000),
      story('b', 'Updated version', sharedUrl, 2000),
    ]);
    assert.equal(result.length, 1);

    const [survivor] = result;
    assert.equal(survivor.publishedAt, 2000);
    assert.equal(survivor.id, 'b');
  });

  it('keeps distinct URLs unchanged', () => {
    const result = deduplicateByUrl([
      story('1', 'Story A', 'https://www.iea.org/a', 1000),
      story('2', 'Story B', 'https://www.iea.org/b', 2000),
    ]);
    assert.equal(result.length, 2);
  });
});
// ---------------------------------------------------------------------------
// Age filter integration
// ---------------------------------------------------------------------------
// NOTE(review): this suite re-creates the 30-day cutoff and the filter
// expression locally; it does not call into the seed module itself.
describe('age filter', () => {
  it('item older than 30 days is excluded via AGE_LIMIT_MS threshold', () => {
    const DAY_MS = 24 * 3600 * 1000;
    const now = Date.now();
    const cutoff = now - 30 * DAY_MS;

    const candidates = [
      { id: 'old', title: 'Old oil report', url: 'https://example.com/old', source: 'IEA', publishedAt: now - 31 * DAY_MS, summary: '' },
      { id: 'new', title: 'New gas update', url: 'https://example.com/new', source: 'IEA', publishedAt: now, summary: '' },
    ];
    const kept = candidates.filter(({ publishedAt }) => publishedAt >= cutoff);

    assert.equal(kept.length, 1);
    assert.equal(kept[0].id, 'new');
  });
});
// ---------------------------------------------------------------------------
// Exported key constants
// ---------------------------------------------------------------------------
// Pins the public cache key and the lower bound on the cache TTL.
describe('exported constants', () => {
  it("ENERGY_INTELLIGENCE_KEY === 'energy:intelligence:feed:v1'", () => {
    const expectedKey = 'energy:intelligence:feed:v1';
    assert.equal(ENERGY_INTELLIGENCE_KEY, expectedKey);
  });

  it('INTELLIGENCE_TTL_SECONDS >= 24 * 3600 (24h minimum)', () => {
    const minimumTtlSeconds = 24 * 3600;
    const meetsMinimum = INTELLIGENCE_TTL_SECONDS >= minimumTtlSeconds;
    assert.ok(meetsMinimum, `TTL ${INTELLIGENCE_TTL_SECONDS}s is less than 24h minimum`);
  });
});
// ---------------------------------------------------------------------------
// validate — the gate that controls skip vs. publish in runSeed
// ---------------------------------------------------------------------------
// OPEC is best-effort and OilPrice is the primary source, so fewer-than-3
// items is a real production scenario. A regression here would ship with all
// other tests green while runSeed silently extends old TTLs instead of writing.
// Boundary checks around the three-item minimum enforced by validate().
describe('validate', () => {
  // Payload carrying `count` placeholder items with single-letter urls.
  const payloadOf = (count) => ({ items: ['a', 'b', 'c'].slice(0, count).map((url) => ({ url })) });

  it('returns false for null', () => {
    assert.equal(validate(null), false);
  });

  it('returns false when items is missing', () => {
    assert.equal(validate({}), false);
  });

  it('returns false for fewer than 3 items', () => {
    for (const count of [0, 1, 2]) {
      assert.equal(validate(payloadOf(count)), false);
    }
  });

  it('returns true for exactly 3 items', () => {
    assert.equal(validate(payloadOf(3)), true);
  });

  it('returns true for more than 3 items', () => {
    const items = [];
    for (let i = 0; i < 10; i += 1) {
      items.push({ url: `https://example.com/${i}` });
    }
    assert.equal(validate({ items }), true);
  });
});
// ---------------------------------------------------------------------------
// decodeHtmlEntities — numeric and extended named entity handling
// ---------------------------------------------------------------------------
// decodeHtmlEntities is not exported, so it is exercised through the title of
// a single-item feed parsed by parseRssItems.
describe('decodeHtmlEntities via parseRssItems title', () => {
  const wrapInRss = (title) => `<rss version="2.0"><channel>
<item>
<title>${title}</title>
<link>https://example.com/1</link>
<pubDate>Sun, 05 Apr 2026 10:00:00 +0000</pubDate>
</item>
</channel></rss>`;

  // One row per case: test name, raw title, character that must appear in the
  // decoded title, and the label used in the failure message.
  const cases = [
    { name: 'decodes numeric decimal entity &#8217; → right single quote', raw: 'Europe&#8217;s gas storage', expected: '\u2019', label: 'right quote' },
    { name: 'decodes numeric hex entity &#x2019; → right single quote', raw: 'Europe&#x2019;s gas', expected: '\u2019', label: 'right quote' },
    { name: 'decodes &mdash; → em dash', raw: 'Oil prices &mdash; weekly review', expected: '—', label: 'em dash' },
    { name: 'decodes &hellip; → ellipsis', raw: 'OPEC output cuts&hellip;', expected: '…', label: 'ellipsis' },
    { name: 'decodes &apos; → apostrophe', raw: 'Europe&apos;s energy', expected: "'", label: 'apostrophe' },
  ];

  for (const { name, raw, expected, label } of cases) {
    it(name, () => {
      const items = parseRssItems(wrapInRss(raw), 'Test');
      const decodedTitle = items[0].title;
      assert.ok(decodedTitle.includes(expected), `Expected ${label}, got: ${decodedTitle}`);
    });
  }
});