Files
worldmonitor/scripts/shared/geo-extract.mjs
Elie Habib 02555671f2 refactor: consolidate country name/code mappings into single canonical sources (#2676)
* refactor(country-maps): consolidate country name/ISO maps

Expand shared/country-names.json from 265 to 309 entries by merging
geojson names, COUNTRY_ALIAS_MAP, upstream API variants (World Bank,
WHO, UN, FAO), and seed-correlation extras.

Add ISO3 map generator (generate-iso3-maps.cjs) producing
iso3-to-iso2.json (239 entries) and iso2-to-iso3.json (239 entries)
with TWN and XKX supplements.

Add build-country-names.cjs for reproducible expansion from all sources.
Sync scripts/shared/ copies for edge-function test compatibility.

* refactor: consolidate country name/code mappings into single canonical sources

Eliminates fragmented country mapping across the repo. Every feature
(resilience, conflict, correlation, intelligence) was maintaining its
own partial alias map.

Data consolidation:
- Expand shared/country-names.json from 265 to 302 entries covering
  World Bank, WHO, UN, FAO, and correlation script naming variants
- Generate shared/iso3-to-iso2.json (239 entries) and
  shared/iso2-to-iso3.json from countries.geojson + supplements
  (Taiwan TWN, Kosovo XKX)

Consumer migrations:
- _country-resolver.mjs: delete COUNTRY_ALIAS_MAP (37 entries),
  replace 2MB geojson parse with 5KB iso3-to-iso2.json
- conflict/_shared.ts: replace 33-entry ISO2_TO_ISO3 literal
- seed-conflict-intel.mjs: replace 20-entry ISO2_TO_ISO3 literal
- _dimension-scorers.ts: replace geojson-based ISO3 construction
- get-risk-scores.ts: replace 31-entry ISO3_TO_ISO2 literal
- seed-correlation.mjs: replace 102-entry COUNTRY_NAME_TO_ISO2
  and 90-entry ISO3_TO_ISO2, use resolveIso2() from canonical
  resolver, lower short-alias threshold to 2 chars with word
  boundary matching, export matchCountryNamesInText(), add isMain
  guard

Tests:
- New tests/country-resolver.test.mjs with structural validation,
  parity regression for all 37 old aliases, ISO3 bidirectional
  consistency, and Taiwan/Kosovo assertions
- Updated resilience seed test for new resolver signature

Net: -190 lines, 0 hardcoded country maps remaining

* fix: normalize raw text before country name matching

Text matchers (geo-extract, seed-security-advisories, seed-correlation)
were matching normalized keys against raw text containing diacritics
and punctuation. "Curaçao", "Timor-Leste", "Hong Kong S.A.R." all
failed to resolve after country-names.json keys were normalized.

Fix: apply NFKD + diacritic stripping + punctuation normalization to
input text before matching, same transform used on the keys.

Also add "hong kong" and "sao tome" as short-form keys for bigram
headline matching in geo-extract.

* fix: remove 'u s' alias that caused US/VI misattribution

'u s' in country-names.json matched before 'u s virgin islands' in
geo-extract's bigram scanner, attributing Virgin Islands headlines
to US. Removed since 'usa', 'united states', and the uppercase US
expansion already cover the United States.
2026-04-04 15:38:02 +04:00

126 lines
4.8 KiB
JavaScript

/**
* Lightweight geopolitical keyword → ISO2 extractor.
* Uses country-names.json as the base, extended with common city/region aliases
* and short-form geopolitical names that appear frequently in news headlines.
*/
import { createRequire } from 'module';
import { fileURLToPath } from 'url';
import { dirname, join } from 'path';
// ES modules have no built-in `require`; create one bound to this file's URL so the
// JSON data file below can be loaded with a plain require() call.
const require = createRequire(import.meta.url);
// Directory that contains this module, derived from its file URL.
const __dirname = dirname(fileURLToPath(import.meta.url));
// Contents of country-names.json, which sits next to this module. Its entries are
// consumed further down as [name, iso2] pairs when the lookup table is built.
const countryNames = require(join(__dirname, 'country-names.json'));
// Headline aliases → ISO2 that supplement country-names.json: capitals, seats of
// government, demonyms and short forms. Keys are lowercase; multi-word keys are
// exactly two words so they can be hit by the bigram scan in extractCountryCode().
//
// 'XX' is a marker for supranational / multi-country terms. extractCountryCode()
// never returns 'XX': a hit on such a key is skipped and scanning continues.
const ALIAS_MAP = {
  // Capitals, demonyms and common short forms
  moscow: 'RU',
  kremlin: 'RU',
  russian: 'RU',
  beijing: 'CN',
  chinese: 'CN',
  prc: 'CN',
  washington: 'US',
  american: 'US',
  pentagon: 'US',
  kyiv: 'UA',
  ukrainian: 'UA',
  tehran: 'IR',
  iranian: 'IR',
  pyongyang: 'KP',
  'north korean': 'KP',
  taipei: 'TW',
  taiwanese: 'TW',
  riyadh: 'SA',
  saudi: 'SA',
  'tel aviv': 'IL',
  israeli: 'IL',
  gaza: 'PS',
  'west bank': 'PS',
  palestinian: 'PS',
  damascus: 'SY',
  syrian: 'SY',
  kabul: 'AF',
  afghan: 'AF',
  islamabad: 'PK',
  pakistani: 'PK',
  'new delhi': 'IN',
  indian: 'IN',
  ankara: 'TR',
  turkish: 'TR',
  berlin: 'DE',
  german: 'DE',
  paris: 'FR',
  french: 'FR',
  london: 'GB',
  british: 'GB',
  uk: 'GB',
  tokyo: 'JP',
  japanese: 'JP',
  seoul: 'KR',
  'south korean': 'KR',
  manila: 'PH',
  philippine: 'PH',
  hanoi: 'VN',
  vietnamese: 'VN',
  caracas: 'VE',
  venezuelan: 'VE',
  havana: 'CU',
  cuban: 'CU',
  minsk: 'BY',
  belarusian: 'BY',
  belgrade: 'RS',
  serbian: 'RS',
  warsaw: 'PL',
  polish: 'PL',
  budapest: 'HU',
  hungarian: 'HU',
  prague: 'CZ',
  czech: 'CZ',
  baghdad: 'IQ',
  iraqi: 'IQ',
  sanaa: 'YE',
  yemeni: 'YE',
  tripoli: 'LY',
  libyan: 'LY',
  khartoum: 'SD',
  sudanese: 'SD',
  'addis ababa': 'ET',
  ethiopian: 'ET',
  nairobi: 'KE',
  kenyan: 'KE',
  lagos: 'NG',
  nigerian: 'NG',
  pretoria: 'ZA',
  'south african': 'ZA',
  brasilia: 'BR',
  brazilian: 'BR',
  bogota: 'CO',
  colombian: 'CO',
  'buenos aires': 'AR',
  argentine: 'AR',
  lima: 'PE',
  peruvian: 'PE',
  'mexico city': 'MX',
  mexican: 'MX',
  ottawa: 'CA',
  canadian: 'CA',
  canberra: 'AU',
  australian: 'AU',

  // Alliances and regions seen in headlines (marked 'XX', i.e. no single country)
  nato: 'XX',
  eu: 'XX',
  europe: 'XX',

  // Plain country names pinned here so the alias layer always resolves them
  ukraine: 'UA',
  taiwan: 'TW',
};
// Country names that must not resolve when they appear as a single bare word,
// because in English-language news they are frequently something else.
// Only the single-word check in extractCountryCode() consults this set; two-word
// lookups (e.g. 'south africa') are unaffected.
const UNIGRAM_STOPWORDS = new Set([
  // Also everyday English given names
  'chad',
  'jordan',
  // Also a US state
  'georgia',
  // Blocked as a bare word. Note that the 'nigerian' alias resolves to NG (Nigeria),
  // so it does not stand in for Niger.
  'niger',
  // Occurs inside several compound country names (Equatorial Guinea, etc.)
  'guinea',
  // Original rationale: resembles the ending of words such as Somali / Bengali
  'mali',
  // Original rationale: low geopolitical frequency, shows up in product names.
  // The 'lima' and 'peruvian' aliases still resolve to PE.
  'peru',
]);
// Merged lowercase-name → ISO2 table. country-names.json entries are written first and
// ALIAS_MAP entries second, so an alias wins whenever both define the same key.
//
// The table is created without a prototype: on a plain `{}` a headline word such as
// "constructor" would resolve to the inherited Object.prototype member, which is truthy
// and would be mistaken for a country match.
const LOOKUP = Object.create(null);
for (const source of [countryNames, ALIAS_MAP]) {
  for (const [name, iso2] of Object.entries(source)) {
    LOOKUP[name.toLowerCase()] = iso2;
  }
}
// Punctuation that is turned into a word separator after lowercasing: ASCII apostrophe,
// period, parentheses, comma, slash and hyphen, plus their typographic counterparts
// (U+2018/U+2019 quotes and the U+2010–U+2015 hyphen/dash range). Without the
// typographic forms, "Russia’s" would collapse to "russias" and never match.
const SEPARATOR_PUNCTUATION = /['.(),/\u2018\u2019\u2010-\u2015-]/g;

/**
 * Resolve one normalized key against LOOKUP.
 * Only own entries count, so words that name inherited object members
 * (e.g. "constructor") are misses. The 'XX' multi-country marker is also a miss.
 * @param {string} key lowercase unigram or "left right" bigram
 * @returns {string|null} ISO2 code, or null when there is no usable entry
 */
function lookupIso2(key) {
  if (!Object.prototype.hasOwnProperty.call(LOOKUP, key)) return null;
  const iso2 = LOOKUP[key];
  return iso2 && iso2 !== 'XX' ? iso2 : null;
}

/**
 * Extract the first matching ISO2 country code from a text string.
 *
 * The text is scanned left to right. At each position the two-word phrase starting
 * there is tried first, then the single word (unless it is shorter than two letters
 * or listed in UNIGRAM_STOPWORDS). Entries marked 'XX' are skipped and scanning continues.
 *
 * @param {string} text headline or other free text
 * @returns {string|null} ISO2 code of the first match, or null when nothing matches
 *   or `text` is empty / not a string
 */
export function extractCountryCode(text) {
  if (typeof text !== 'string' || text === '') return null;

  // Expand the uppercase token `US` before lowercasing so the abbreviation resolves via
  // the 'united states' key. The match is case-sensitive, so a lowercase `us` (pronoun)
  // is left untouched and gets no special treatment here. `\b` keeps the expansion from
  // firing inside longer uppercase words such as "USA" or "PLUS".
  const normalized = text
    .replace(/\bUS\b/g, 'United States')
    // Decompose accented letters, then drop the combining marks ("Curaçao" → "Curacao").
    .normalize('NFKD')
    .replace(/\p{Diacritic}/gu, '')
    .toLowerCase()
    .replace(SEPARATOR_PUNCTUATION, ' ');

  // Tokens are reduced to a–z only. A token that becomes empty (e.g. "2024") is kept in
  // place so it still separates its neighbours and no bigram is formed across it.
  const tokens = normalized
    .split(/\s+/)
    .filter(Boolean)
    .map((word) => word.replace(/[^a-z]/g, ''));

  for (let i = 0; i < tokens.length; i++) {
    const current = tokens[i];
    const next = tokens[i + 1];

    // Two-word names take priority over the single word at the same position.
    if (current && next) {
      const pairMatch = lookupIso2(`${current} ${next}`);
      if (pairMatch) return pairMatch;
    }

    if (current.length < 2 || UNIGRAM_STOPWORDS.has(current)) continue;
    const wordMatch = lookupIso2(current);
    if (wordMatch) return wordMatch;
  }
  return null;
}