mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
feat(seed): learned routes cache for grocery basket — skip EXA on known-good URLs (#1981)
* feat(seed): add learned routes cache to grocery basket seed
Persists successful EXA/Firecrawl URL discoveries in Redis so subsequent
runs skip the expensive EXA search for known-good (country, item) pairs.
Strategy per item:
1. Direct fetch + matchPrice on learned URL (free)
2. Firecrawl on learned URL if step 1 fails (handles JS SPAs)
3. Full EXA search only when learned route fails or is absent
4. Saves newly discovered URL as learned route for next run
Safety guarantees matching the Codex review:
- isAllowedRouteHost() validates hostname against country.sites allowlist
before both saving and replaying (prevents stored-SSRF)
- tryDirectFetch() applies CURRENCY_MIN + ITEM_USD_MAX bulk-price guards
identical to the existing EXA and Firecrawl paths
- failsSinceSuccess >= 2 triggers true DEL (not TTL wait)
- SET/DEL conflict resolved: effectiveDeletes filters keys in updates;
DELs sent before SETs in pipeline
- All operations non-fatal: pipeline failures log warnings, seed continues
New exports in _seed-utils.mjs: isAllowedRouteHost, bulkReadLearnedRoutes,
bulkWriteLearnedRoutes (1 pipeline read + 1 pipeline write per run).
BigMac deferred to Phase 2 (uses EXA summaries from aggregator pages).
Estimated savings: ~63 of 90 EXA calls skipped per run at 70% hit rate.
* test(seed): extract processItemRoute for testability; add 5 integration tests
- Move item-level decision tree into processItemRoute() in _seed-utils.mjs
so it can be imported and unit-tested without triggering runSeed()
- seed-grocery-basket.mjs delegates to processItemRoute() with fetchViaExa
callback containing the existing EXA+Firecrawl block
- 5 integration tests cover: learned-hit success (EXA skipped), learned fail
+ EXA replacement, fail x2 eviction, SSRF guard (bad host blocks direct
fetch), EXA success with unlisted host (route not saved)
- Fix: move allowedHosts computation outside Promise.all (once per country)
- Fix: add [EXA->learned] log tag when new route is saved from EXA discovery
- All 21 seed-learned-routes tests pass
* fix(seed): strip path from allowedHosts entries before hostname comparison
grocery-basket.json contains "noon.com/saudi-en" for Saudi Arabia.
allowedHosts was built with only www. stripped, so the comparison
hostname === 'noon.com/saudi-en'
was always false — noon.com routes for SA were rejected or evicted
every run, preventing the cache from ever stabilizing there.
Fix: split('/')[0] after stripping www., giving bare hostname.
Add regression test: path-bearing allowlist entry matches noon.com URL.
This commit is contained in:
@@ -282,6 +282,140 @@ export function sleep(ms) {
|
||||
return new Promise((r) => setTimeout(r, ms));
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Learned Routes — persist successful scrape URLs across seed runs
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Validate a URL's hostname against a list of allowed domains (the same list
 * used for EXA includeDomains). Prevents stored-SSRF from Redis-persisted URLs.
 *
 * A URL is accepted when its hostname equals an allowlist entry or is a
 * subdomain of one. Both sides are normalised before comparison:
 *   - the URL's hostname has a leading "www." removed;
 *   - each allowlist entry is trimmed, lower-cased, and stripped of any
 *     scheme, leading "www." and path ("noon.com/saudi-en" -> "noon.com").
 * Empty or non-string entries are ignored, so a blank entry can never act as
 * a wildcard. Only http: and https: URLs are accepted.
 *
 * @param {string} url            Candidate URL (may be malformed).
 * @param {string[]} allowedHosts Allowed domains; entries may carry "www." or a path.
 * @returns {boolean} true when the URL may be fetched; false otherwise
 *                    (including malformed URLs and a non-array allowlist). Never throws.
 */
export function isAllowedRouteHost(url, allowedHosts) {
  if (!Array.isArray(allowedHosts)) return false;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return false;
  }

  // Replayed routes are fetched over HTTP(S) only; reject every other scheme.
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return false;

  // URL already lower-cases the hostname; only the "www." prefix needs stripping.
  const hostname = parsed.hostname.replace(/^www\./, '');

  return allowedHosts.some((entry) => {
    if (typeof entry !== 'string') return false;
    const host = entry
      .trim()
      .toLowerCase()
      .replace(/^[a-z][a-z0-9+.-]*:\/\//, '')
      .replace(/^www\./, '')
      .split('/')[0];
    // An empty entry would otherwise match any hostname ending in ".".
    if (host === '') return false;
    return hostname === host || hostname.endsWith(`.${host}`);
  });
}
|
||||
|
||||
/**
 * Batch-read all learned routes for a scope via a single Upstash pipeline request.
 *
 * One GET per key is sent to `${url}/pipeline`; each key is stored in Redis as
 * `seed-routes:<scope>:<key>`. Replies with no value are omitted from the
 * result, and replies whose value is not valid JSON are skipped with a warning.
 *
 * @param {string} scope   Namespace segment of the Redis key (e.g. 'grocery-basket').
 * @param {string[]} keys  Route keys to load.
 * @returns {Promise<Map<string, any>>} Map of key -> parsed route data.
 * @throws {Error} When the pipeline request returns a non-OK HTTP status. Callers
 *                 that want the read to be non-fatal must catch this themselves.
 */
export async function bulkReadLearnedRoutes(scope, keys) {
  const routes = new Map();
  // Nothing requested: skip credentials lookup and the network round-trip.
  if (!keys.length) return routes;

  const { url, token } = getRedisCredentials();
  const commands = keys.map((key) => ['GET', `seed-routes:${scope}:${key}`]);

  const resp = await fetch(`${url}/pipeline`, {
    method: 'POST',
    headers: { Authorization: `Bearer ${token}`, 'Content-Type': 'application/json' },
    body: JSON.stringify(commands),
    signal: AbortSignal.timeout(10_000),
  });
  if (!resp.ok) throw new Error(`bulkReadLearnedRoutes HTTP ${resp.status}`);

  // Replies are matched to keys by position.
  const replies = await resp.json();
  for (const [idx, key] of keys.entries()) {
    const raw = replies[idx]?.result;
    if (!raw) continue;
    try {
      routes.set(key, JSON.parse(raw));
    } catch {
      console.warn(` [routes] malformed JSON for ${key} — skipping`);
    }
  }
  return routes;
}
|
||||
|
||||
/**
 * Batch-write route updates and hard-delete evicted routes via a single
 * Upstash pipeline request.
 *
 * - Keys present in `updates` always win over `deletes` (SET/DEL conflict
 *   resolution): such keys are filtered out of the delete list.
 * - DELs are placed before SETs in the pipeline.
 * - Every SET carries a 14-day expiry.
 * - When there is nothing to write the function returns before resolving
 *   Redis credentials or touching the network.
 *
 * @param {string} scope                 Namespace segment of the Redis key.
 * @param {Map<string, any>} updates     key -> route object to persist (JSON-serialised).
 * @param {Set<string>} [deletes]        Keys to hard-delete.
 * @returns {Promise<void>}
 * @throws {Error} When the pipeline request returns a non-OK HTTP status. Callers
 *                 that want the write to be non-fatal must catch this themselves.
 */
export async function bulkWriteLearnedRoutes(scope, updates, deletes = new Set()) {
  const ROUTE_TTL = 14 * 24 * 3600; // 14 days, in seconds (Redis EX)

  const effectiveDeletes = [...deletes].filter((k) => !updates.has(k));
  const pipeline = [
    ...effectiveDeletes.map((k) => ['DEL', `seed-routes:${scope}:${k}`]),
    ...[...updates].map(([k, v]) => ['SET', `seed-routes:${scope}:${k}`, JSON.stringify(v), 'EX', ROUTE_TTL]),
  ];
  // No-op before credentials are resolved, mirroring bulkReadLearnedRoutes.
  if (!pipeline.length) return;

  const { url, token } = getRedisCredentials();
  const resp = await fetch(`${url}/pipeline`, {
    method: 'POST',
    headers: { Authorization: `Bearer ${token}`, 'Content-Type': 'application/json' },
    body: JSON.stringify(pipeline),
    signal: AbortSignal.timeout(15_000),
  });
  if (!resp.ok) throw new Error(`bulkWriteLearnedRoutes HTTP ${resp.status}`);

  // A pipeline can return HTTP 200 while individual commands fail; surface
  // those so the summary below is not mistaken for full success. Reading the
  // body is best-effort diagnostics only and never fails the write.
  let failedCommands = 0;
  try {
    const results = await resp.json();
    if (Array.isArray(results)) failedCommands = results.filter((r) => r?.error).length;
  } catch {
    // Body missing or not JSON — the HTTP status already confirmed acceptance.
  }
  if (failedCommands > 0) {
    console.warn(` [routes] ${failedCommands} of ${pipeline.length} pipeline commands reported errors`);
  }
  console.log(` [routes] written: ${updates.size} updated, ${effectiveDeletes.length} deleted`);
}
|
||||
|
||||
/**
 * Decision tree for a single seed item: replay the learned route first, fall
 * back to EXA. All external I/O is injected so the function can be unit-tested
 * without Redis or HTTP.
 *
 * Flow:
 *   1. A learned route with failsSinceSuccess >= 2, or whose host is not
 *      allowed, is marked for deletion and never fetched.
 *   2. Otherwise the learned URL is tried with tryDirectFetch, then (after
 *      sleeping firecrawlDelayMs) with scrapeFirecrawl. A Firecrawl price whose
 *      USD equivalent exceeds itemUsdMax counts as a failure.
 *   3. A failed replay increments failsSinceSuccess; reaching 2 marks the route
 *      for deletion.
 *   4. Whenever no price was obtained, fetchViaExa runs. Its source URL is
 *      stored as the new route only when its host is allowed.
 *
 * @returns {Promise<{localPrice: any, sourceSite: string, routeUpdate: object|null, routeDelete: boolean}>}
 *   routeUpdate — route object to persist (null = nothing to write)
 *   routeDelete — true if the Redis key should be hard-deleted
 */
export async function processItemRoute({
  learned,               // route object from Redis, or undefined/null on first run
  allowedHosts,          // string[] — normalised (no www.), same as EXA includeDomains
  currency,              // e.g. 'AED'
  itemId,                // e.g. 'sugar' — used only for log messages
  fxRate,                // number | null
  itemUsdMax = null,     // per-item bulk cap in USD (ITEM_USD_MAX[itemId])
  tryDirectFetch,        // async (url, currency, itemId, fxRate) => number | null
  scrapeFirecrawl,       // async (url, currency) => { price, source } | null
  fetchViaExa,           // async () => { localPrice, sourceSite } | null (caller owns EXA+FC logic)
  sleep: sleepFn,        // async ms => void
  firecrawlDelayMs = 0,
}) {
  const outcome = { localPrice: null, sourceSite: '', routeUpdate: null, routeDelete: false };

  // Copy of the learned route with its success bookkeeping refreshed.
  const creditSuccess = () => ({
    ...learned,
    hits: learned.hits + 1,
    failsSinceSuccess: 0,
    lastSuccessAt: Date.now(),
  });

  // Gate: decide whether the learned route may be replayed at all.
  let replayable = false;
  if (learned) {
    const exhausted = learned.failsSinceSuccess >= 2;
    // The host check only runs when the route is not already exhausted.
    replayable = !exhausted && isAllowedRouteHost(learned.url, allowedHosts);
    if (!replayable) {
      outcome.routeDelete = true;
      console.log(` [learned✗] ${itemId}: evicting (${exhausted ? '2 failures' : 'invalid host'})`);
    }
  }

  if (replayable) {
    outcome.localPrice = await tryDirectFetch(learned.url, currency, itemId, fxRate);
    if (outcome.localPrice === null) {
      // Direct fetch found nothing — retry the same URL through Firecrawl.
      await sleepFn(firecrawlDelayMs);
      const fc = await scrapeFirecrawl(learned.url, currency);
      const overCap = fc && fxRate && itemUsdMax && fc.price * fxRate > itemUsdMax;
      const newFails = learned.failsSinceSuccess + 1;
      if (fc && !overCap) {
        outcome.localPrice = fc.price;
        outcome.sourceSite = fc.source;
        outcome.routeUpdate = creditSuccess();
        console.log(` [learned-FC✓] ${itemId}: ${outcome.localPrice} ${currency}`);
      } else if (newFails >= 2) {
        outcome.routeDelete = true;
        console.log(` [learned✗→EXA] ${itemId}: 2 failures — evicting, retrying via EXA`);
      } else {
        outcome.routeUpdate = { ...learned, failsSinceSuccess: newFails };
        console.log(` [learned✗→EXA] ${itemId}: failed (${newFails}/2), retrying via EXA`);
      }
    } else {
      outcome.sourceSite = learned.url;
      outcome.routeUpdate = creditSuccess();
      console.log(` [learned✓] ${itemId}: ${outcome.localPrice} ${currency}`);
    }
  }

  // EXA fallback: runs whenever no price has been obtained so far.
  if (outcome.localPrice === null) {
    const exaResult = await fetchViaExa();
    if (exaResult?.localPrice != null) {
      outcome.localPrice = exaResult.localPrice;
      outcome.sourceSite = exaResult.sourceSite || '';
      // Only remember the discovered URL when its host is on the allowlist.
      if (outcome.sourceSite && isAllowedRouteHost(outcome.sourceSite, allowedHosts)) {
        outcome.routeUpdate = {
          url: outcome.sourceSite,
          lastSuccessAt: Date.now(),
          hits: 1,
          failsSinceSuccess: 0,
          currency,
        };
        console.log(` [EXA->learned] ${itemId}: saved ${outcome.sourceSite.slice(0, 55)}`);
      }
    }
  }

  return outcome;
}
|
||||
|
||||
/**
|
||||
* Read the current canonical snapshot from Redis before a seed run overwrites it.
|
||||
* Used by seed scripts that compute WoW deltas (bigmac, grocery-basket).
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { loadEnvFile, loadSharedConfig, CHROME_UA, runSeed, sleep, readSeedSnapshot } from './_seed-utils.mjs';
|
||||
import { loadEnvFile, loadSharedConfig, CHROME_UA, runSeed, sleep, readSeedSnapshot, bulkReadLearnedRoutes, bulkWriteLearnedRoutes, isAllowedRouteHost, processItemRoute } from './_seed-utils.mjs';
|
||||
|
||||
loadEnvFile(import.meta.url);
|
||||
|
||||
@@ -142,6 +142,30 @@ async function scrapeFirecrawl(url, expectedCurrency) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fast learned-route replay: direct fetch + matchPrice + the same guardrails as
 * the EXA/Firecrawl paths. Inline (not in _seed-utils) because it closes over
 * CURRENCY_MIN, ITEM_USD_MAX and matchPrice.
 *
 * Only the first 10,000 characters of the response body are scanned.
 *
 * @param {string} url               Learned URL to fetch.
 * @param {string} expectedCurrency  Currency code the matched price must carry.
 * @param {string} itemId            Item id; selects the ITEM_USD_MAX cap and labels logs.
 * @param {number|null} fxRate       Multiplier applied to the local price to get USD.
 * @returns {Promise<number|null>} The matched price, or null on any failure:
 *   non-OK response, no match, wrong currency, price at or below the currency
 *   minimum, price >= 100,000, USD equivalent above the per-item cap, or any
 *   thrown error (network, timeout).
 */
async function tryDirectFetch(url, expectedCurrency, itemId, fxRate) {
  try {
    const response = await fetch(url, {
      headers: { 'User-Agent': CHROME_UA },
      signal: AbortSignal.timeout(8_000),
    });
    if (!response.ok) return null;

    const body = await response.text();
    const match = matchPrice(body.slice(0, 10_000), url);
    if (!match) return null;
    if (match.currency !== expectedCurrency) return null;

    // Sanity bounds: above the per-currency floor and below a hard ceiling.
    const floor = CURRENCY_MIN[expectedCurrency] ?? 0;
    if (match.price <= floor || match.price >= 100_000) return null;

    // Reject bulk/warehouse sizes via the per-item USD cap (when both are known).
    const usdCap = ITEM_USD_MAX[itemId];
    if (fxRate && usdCap && match.price * fxRate > usdCap) {
      console.warn(` [learned bulk] ${itemId}: ${match.price} ${expectedCurrency} ($${(match.price * fxRate).toFixed(2)}) > max — skipping`);
      return null;
    }

    return match.price;
  } catch {
    // Best-effort replay: any error means "no price"; the caller falls back.
    return null;
  }
}
|
||||
|
||||
// All supported currency codes — keep in sync with grocery-basket.json fxSymbols
|
||||
const CCY = 'USD|GBP|EUR|JPY|CNY|INR|AUD|CAD|BRL|MXN|ZAR|TRY|NGN|KRW|SGD|PKR|AED|SAR|QAR|KWD|BHD|OMR|EGP|JOD|LBP|KES|ARS|IDR|PHP';
|
||||
|
||||
@@ -220,62 +244,90 @@ async function fetchGroceryBasketPrices(prevSnapshot) {
|
||||
|
||||
const countriesResult = [];
|
||||
|
||||
// Load all learned routes in one pipeline request before the country loop
|
||||
const routeKeys = config.countries.flatMap(c => config.items.map(i => `${c.code}:${i.id}`));
|
||||
const learnedRoutes = await bulkReadLearnedRoutes('grocery-basket', routeKeys).catch((err) => {
|
||||
console.warn(` [routes] load failed (non-fatal): ${err.message}`);
|
||||
return new Map();
|
||||
});
|
||||
const routeUpdates = new Map();
|
||||
const routeDeletes = new Set();
|
||||
console.log(` [routes] loaded ${learnedRoutes.size} learned routes`);
|
||||
|
||||
for (const country of config.countries) {
|
||||
console.log(`\n Processing ${country.flag} ${country.name} (${country.currency})...`);
|
||||
const fxRate = fxRates[country.currency] || FX_FALLBACKS[country.currency] || null;
|
||||
const allowedHosts = country.sites.map(s => s.replace(/^www\./, '').split('/')[0]);
|
||||
|
||||
// Process all items concurrently — 200ms stagger to respect EXA/Firecrawl rate limits
|
||||
const itemPrices = await Promise.all(config.items.map(async (item, idx) => {
|
||||
await sleep(idx * 200); // stagger starts — 200ms prevents EXA rate limit with 10 concurrent
|
||||
|
||||
let localPrice = null;
|
||||
let sourceSite = '';
|
||||
const routeKey = `${country.code}:${item.id}`;
|
||||
const learned = learnedRoutes.get(routeKey);
|
||||
|
||||
let exaUrls = [];
|
||||
try {
|
||||
const exaResult = await searchExa(`${item.query} price`, country.sites, country.code);
|
||||
|
||||
if (exaResult?.results?.length) {
|
||||
exaUrls = exaResult.results.map(r => r.url).filter(Boolean);
|
||||
for (const result of exaResult.results) {
|
||||
const extracted = extractPrice(result, country.currency);
|
||||
if (!extracted) continue;
|
||||
// Reject bulk/warehouse sizes by checking USD equivalent against per-item cap
|
||||
if (fxRate && ITEM_USD_MAX[item.id]) {
|
||||
const usdEquiv = extracted.price * fxRate;
|
||||
if (usdEquiv > ITEM_USD_MAX[item.id]) {
|
||||
console.warn(` [bulk] ${item.id}: ${extracted.price} ${country.currency} ($${usdEquiv.toFixed(2)}) > max $${ITEM_USD_MAX[item.id]} — skipping`);
|
||||
continue;
|
||||
// --- Learned route fast path + EXA fallback ---
|
||||
const { localPrice, sourceSite, routeUpdate, routeDelete } = await processItemRoute({
|
||||
learned,
|
||||
allowedHosts,
|
||||
currency: country.currency,
|
||||
itemId: item.id,
|
||||
fxRate,
|
||||
itemUsdMax: ITEM_USD_MAX[item.id] || null,
|
||||
tryDirectFetch,
|
||||
scrapeFirecrawl,
|
||||
fetchViaExa: async () => {
|
||||
let exaPrice = null;
|
||||
let exaSite = '';
|
||||
let exaUrls = [];
|
||||
try {
|
||||
const exaResult = await searchExa(`${item.query} price`, country.sites, country.code);
|
||||
if (exaResult?.results?.length) {
|
||||
exaUrls = exaResult.results.map(r => r.url).filter(Boolean);
|
||||
for (const result of exaResult.results) {
|
||||
const extracted = extractPrice(result, country.currency);
|
||||
if (!extracted) continue;
|
||||
if (fxRate && ITEM_USD_MAX[item.id]) {
|
||||
const usdEquiv = extracted.price * fxRate;
|
||||
if (usdEquiv > ITEM_USD_MAX[item.id]) {
|
||||
console.warn(` [bulk] ${item.id}: ${extracted.price} ${country.currency} ($${usdEquiv.toFixed(2)}) > max $${ITEM_USD_MAX[item.id]} — skipping`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
exaPrice = extracted.price;
|
||||
exaSite = extracted.source;
|
||||
break;
|
||||
}
|
||||
}
|
||||
localPrice = extracted.price;
|
||||
sourceSite = extracted.source;
|
||||
break;
|
||||
} catch (err) {
|
||||
console.warn(` [${country.code}/${item.id}] EXA error: ${err.message}`);
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
console.warn(` [${country.code}/${item.id}] EXA error: ${err.message}`);
|
||||
}
|
||||
|
||||
// Firecrawl fallback — renders JS-heavy SPAs (noon.com, coupang, shopee, etc.)
|
||||
if (localPrice === null && exaUrls.length > 0) {
|
||||
for (const url of exaUrls.slice(0, 2)) {
|
||||
const fc = await scrapeFirecrawl(url, country.currency);
|
||||
if (!fc) continue;
|
||||
// Apply same bulk cap to Firecrawl results
|
||||
if (fxRate && ITEM_USD_MAX[item.id]) {
|
||||
const usdEquiv = fc.price * fxRate;
|
||||
if (usdEquiv > ITEM_USD_MAX[item.id]) {
|
||||
console.warn(` [FC bulk] ${item.id}: ${fc.price} ${country.currency} ($${usdEquiv.toFixed(2)}) > max — skipping`);
|
||||
continue;
|
||||
// Firecrawl fallback for EXA-discovered URLs (handles JS-heavy SPAs)
|
||||
if (exaPrice === null && exaUrls.length > 0) {
|
||||
for (const url of exaUrls.slice(0, 2)) {
|
||||
const fc = await scrapeFirecrawl(url, country.currency);
|
||||
if (!fc) continue;
|
||||
if (fxRate && ITEM_USD_MAX[item.id]) {
|
||||
const usdEquiv = fc.price * fxRate;
|
||||
if (usdEquiv > ITEM_USD_MAX[item.id]) {
|
||||
console.warn(` [FC bulk] ${item.id}: ${fc.price} ${country.currency} ($${usdEquiv.toFixed(2)}) > max — skipping`);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
exaPrice = fc.price;
|
||||
exaSite = fc.source;
|
||||
console.log(` [FC✓] ${item.id}: ${url.slice(0, 55)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
localPrice = fc.price;
|
||||
sourceSite = fc.source;
|
||||
console.log(` [FC✓] ${item.id}: ${url.slice(0, 55)}`);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return exaPrice !== null ? { localPrice: exaPrice, sourceSite: exaSite } : null;
|
||||
},
|
||||
sleep,
|
||||
firecrawlDelayMs: FIRECRAWL_DELAY_MS,
|
||||
});
|
||||
|
||||
if (routeDelete) routeDeletes.add(routeKey);
|
||||
if (routeUpdate) routeUpdates.set(routeKey, routeUpdate);
|
||||
|
||||
const usdPrice = localPrice !== null && fxRate ? +(localPrice * fxRate).toFixed(4) : null;
|
||||
const status = localPrice !== null ? `${localPrice} ${country.currency} = $${usdPrice}` : 'N/A';
|
||||
@@ -307,6 +359,11 @@ async function fetchGroceryBasketPrices(prevSnapshot) {
|
||||
});
|
||||
}
|
||||
|
||||
// Persist learned routes for next run (non-fatal)
|
||||
await bulkWriteLearnedRoutes('grocery-basket', routeUpdates, routeDeletes).catch(err =>
|
||||
console.warn(` [routes] write failed (non-fatal): ${err.message}`)
|
||||
);
|
||||
|
||||
// Only rank countries with enough items found — a country with 4/10 items
|
||||
// could appear "cheapest" purely due to missing data, not actual prices.
|
||||
const MIN_ITEMS_FOR_RANKING = Math.ceil(config.items.length * 0.7); // ≥ 70% coverage
|
||||
|
||||
321
tests/seed-learned-routes.test.mjs
Normal file
321
tests/seed-learned-routes.test.mjs
Normal file
@@ -0,0 +1,321 @@
|
||||
import assert from 'node:assert/strict';
|
||||
import { describe, it, beforeEach, afterEach } from 'node:test';
|
||||
|
||||
import { isAllowedRouteHost, bulkReadLearnedRoutes, bulkWriteLearnedRoutes, processItemRoute } from '../scripts/_seed-utils.mjs';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// isAllowedRouteHost
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Table-driven: each row is one `it` case — title, URL under test, allowlist,
// and the expected verdict.
describe('isAllowedRouteHost', () => {
  const cases = [
    {
      title: 'accepts URL matching a listed site exactly',
      url: 'https://carrefouruae.com/product/sugar',
      hosts: ['carrefouruae.com', 'noon.com'],
      expected: true,
    },
    {
      title: 'accepts URL with www. prefix',
      url: 'https://www.carrefouruae.com/product/sugar',
      hosts: ['carrefouruae.com'],
      expected: true,
    },
    {
      title: 'accepts subdomain of listed site',
      url: 'https://shop.luluhypermarket.com/en/sugar',
      hosts: ['luluhypermarket.com'],
      expected: true,
    },
    {
      title: 'rejects URL from unlisted hostname',
      url: 'https://numbeo.com/cost-of-living',
      hosts: ['carrefouruae.com'],
      expected: false,
    },
    {
      title: 'rejects malformed URL without throwing',
      url: 'not-a-url',
      hosts: ['carrefouruae.com'],
      expected: false,
    },
    {
      title: 'rejects empty string without throwing',
      url: '',
      hosts: ['carrefouruae.com'],
      expected: false,
    },
    {
      // grocery-basket.json SA sites contains "noon.com/saudi-en" — it must be
      // stripped to a bare hostname before comparison, otherwise no noon.com
      // route ever matches and the SA cache never stabilizes.
      title: 'accepts noon.com URL when allowedHosts entry is path-bearing (noon.com/saudi-en stripped to noon.com)',
      url: 'https://noon.com/saudi-en/sugar',
      hosts: ['noon.com/saudi-en', 'carrefour.com.sa'].map((site) => site.split('/')[0]),
      expected: true,
    },
  ];

  for (const { title, url, hosts, expected } of cases) {
    it(title, () => {
      assert.equal(isAllowedRouteHost(url, hosts), expected);
    });
  }
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers — mock fetch for Redis tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Temporarily set environment variables for a test.
 *
 * @param {Record<string, string>} vars  Variables to set on process.env.
 * @returns {() => void} Restore function: puts back each variable's previous
 *   value, or deletes the variable when it was previously unset.
 */
function withEnv(vars) {
  const saved = [];
  for (const name of Object.keys(vars)) {
    saved.push([name, process.env[name]]);
    process.env[name] = vars[name];
  }

  return function restoreEnvVars() {
    for (const [name, previous] of saved) {
      if (previous !== undefined) {
        process.env[name] = previous;
      } else {
        delete process.env[name];
      }
    }
  };
}
|
||||
|
||||
/**
 * Replace the global fetch with a test double.
 *
 * @param {Function} handler  Stand-in assigned to globalThis.fetch.
 * @returns {() => void} Restore function that reinstates the fetch that was
 *   installed when mockFetch was called.
 */
function mockFetch(handler) {
  const previousFetch = globalThis.fetch;
  globalThis.fetch = handler;
  return function restoreFetch() {
    globalThis.fetch = previousFetch;
  };
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// bulkReadLearnedRoutes
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('bulkReadLearnedRoutes', () => {
  let restoreEnv;
  // Undo function for the fetch mock installed by the current test. It is
  // invoked from afterEach so that a failing assertion cannot leak the mock
  // into later tests.
  let restoreFetch = null;

  beforeEach(() => {
    restoreEnv = withEnv({
      UPSTASH_REDIS_REST_URL: 'https://redis.test',
      UPSTASH_REDIS_REST_TOKEN: 'tok',
    });
  });

  afterEach(() => {
    restoreFetch?.();
    restoreFetch = null;
    restoreEnv();
  });

  it('returns empty Map when keys array is empty (no fetch)', async () => {
    let fetchCalled = false;
    restoreFetch = mockFetch(() => { fetchCalled = true; });
    const result = await bulkReadLearnedRoutes('grocery-basket', []);
    assert.equal(fetchCalled, false);
    assert.equal(result.size, 0);
  });

  it('parses valid pipeline responses into Map', async () => {
    const route = { url: 'https://carrefouruae.com/sugar', lastSuccessAt: 1000, hits: 3, failsSinceSuccess: 0, currency: 'AED' };
    restoreFetch = mockFetch(async () => ({
      ok: true,
      json: async () => [
        { result: JSON.stringify(route) },
        { result: null }, // key absent in Redis
      ],
    }));
    const result = await bulkReadLearnedRoutes('grocery-basket', ['AE:sugar', 'AE:salt']);
    assert.equal(result.size, 1);
    assert.deepEqual(result.get('AE:sugar'), route);
    assert.equal(result.has('AE:salt'), false);
  });

  it('skips malformed JSON entries without throwing', async () => {
    restoreFetch = mockFetch(async () => ({
      ok: true,
      json: async () => [{ result: 'not-valid-json{{' }],
    }));
    const result = await bulkReadLearnedRoutes('grocery-basket', ['AE:sugar']);
    assert.equal(result.size, 0);
  });

  it('throws on HTTP error (non-fatal: caller catches)', async () => {
    restoreFetch = mockFetch(async () => ({ ok: false, status: 500 }));
    await assert.rejects(
      () => bulkReadLearnedRoutes('grocery-basket', ['AE:sugar']),
      /bulkReadLearnedRoutes HTTP 500/
    );
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// bulkWriteLearnedRoutes
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('bulkWriteLearnedRoutes', () => {
  let restoreEnv;
  // Undo function for the fetch mock installed by the current test. It is
  // invoked from afterEach so that a failing assertion cannot leak the mock
  // into later tests.
  let restoreFetch = null;

  // Install a fetch mock that records the parsed pipeline body of the last
  // request and answers with an empty, successful pipeline response.
  function captureFetch() {
    const captured = { body: undefined };
    restoreFetch = mockFetch(async (url, opts) => {
      captured.body = JSON.parse(opts.body);
      return { ok: true, json: async () => [] };
    });
    return captured;
  }

  beforeEach(() => {
    restoreEnv = withEnv({
      UPSTASH_REDIS_REST_URL: 'https://redis.test',
      UPSTASH_REDIS_REST_TOKEN: 'tok',
    });
  });

  afterEach(() => {
    restoreFetch?.();
    restoreFetch = null;
    restoreEnv();
  });

  it('no-ops when both maps are empty (no fetch)', async () => {
    let fetchCalled = false;
    restoreFetch = mockFetch(() => { fetchCalled = true; });
    await bulkWriteLearnedRoutes('grocery-basket', new Map(), new Set());
    assert.equal(fetchCalled, false);
  });

  it('sends SET with 14-day TTL for updated keys', async () => {
    const captured = captureFetch();
    const route = { url: 'https://carrefouruae.com/sugar', lastSuccessAt: 1000, hits: 1, failsSinceSuccess: 0, currency: 'AED' };
    await bulkWriteLearnedRoutes('grocery-basket', new Map([['AE:sugar', route]]), new Set());
    assert.equal(captured.body.length, 1);
    const [cmd, key, val, ex, ttl] = captured.body[0];
    assert.equal(cmd, 'SET');
    assert.equal(key, 'seed-routes:grocery-basket:AE:sugar');
    assert.deepEqual(JSON.parse(val), route);
    assert.equal(ex, 'EX');
    assert.equal(ttl, 14 * 24 * 3600);
  });

  it('sends DEL for evicted keys not in updates', async () => {
    const captured = captureFetch();
    await bulkWriteLearnedRoutes('grocery-basket', new Map(), new Set(['AE:sugar']));
    assert.equal(captured.body.length, 1);
    assert.equal(captured.body[0][0], 'DEL');
    assert.equal(captured.body[0][1], 'seed-routes:grocery-basket:AE:sugar');
  });

  it('SET wins when key is in both updates and deletes — DEL not sent', async () => {
    const captured = captureFetch();
    const route = { url: 'https://carrefouruae.com/sugar', lastSuccessAt: 1000, hits: 1, failsSinceSuccess: 0, currency: 'AED' };
    await bulkWriteLearnedRoutes(
      'grocery-basket',
      new Map([['AE:sugar', route]]),
      new Set(['AE:sugar']) // same key
    );
    // Only SET, no DEL
    assert.equal(captured.body.length, 1);
    assert.equal(captured.body[0][0], 'SET');
  });

  it('sends DELs before SETs in pipeline', async () => {
    const captured = captureFetch();
    const route = { url: 'https://carrefouruae.com/salt', lastSuccessAt: 1000, hits: 1, failsSinceSuccess: 0, currency: 'AED' };
    await bulkWriteLearnedRoutes(
      'grocery-basket',
      new Map([['AE:salt', route]]),
      new Set(['AE:sugar']) // different key — both should appear
    );
    assert.equal(captured.body.length, 2);
    assert.equal(captured.body[0][0], 'DEL'); // DEL first
    assert.equal(captured.body[1][0], 'SET'); // SET second
  });

  it('throws on HTTP error', async () => {
    restoreFetch = mockFetch(async () => ({ ok: false, status: 503 }));
    const route = { url: 'https://carrefouruae.com/sugar', lastSuccessAt: 1000, hits: 1, failsSinceSuccess: 0, currency: 'AED' };
    await assert.rejects(
      () => bulkWriteLearnedRoutes('grocery-basket', new Map([['AE:sugar', route]]), new Set()),
      /bulkWriteLearnedRoutes HTTP 503/
    );
  });
});
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// processItemRoute — integration-level decision tree
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
describe('processItemRoute', () => {
  const noop = async () => {};
  const allowedHosts = ['carrefouruae.com'];
  const baseRoute = { url: 'https://carrefouruae.com/sugar', lastSuccessAt: 1000, hits: 3, failsSinceSuccess: 0, currency: 'AED' };
  // Defaults: every injected I/O function finds nothing; tests override what they need.
  const baseOpts = {
    allowedHosts,
    currency: 'AED',
    itemId: 'sugar',
    fxRate: 0.27,
    itemUsdMax: 5,
    tryDirectFetch: async () => null,
    scrapeFirecrawl: async () => null,
    fetchViaExa: async () => null,
    sleep: noop,
    firecrawlDelayMs: 0,
  };

  it('learned-hit success: fetchViaExa not called', async () => {
    let exaCalled = false;
    const result = await processItemRoute({
      ...baseOpts,
      learned: baseRoute,
      tryDirectFetch: async () => 5.50,
      fetchViaExa: async () => { exaCalled = true; return null; },
    });
    assert.equal(exaCalled, false);
    assert.equal(result.localPrice, 5.50);
    assert.equal(result.routeUpdate?.hits, 4);
    assert.equal(result.routeUpdate?.failsSinceSuccess, 0);
    assert.equal(result.routeDelete, false);
  });

  it('learned-hit fail + EXA success: routeUpdate has new URL, hits=1', async () => {
    const result = await processItemRoute({
      ...baseOpts,
      learned: baseRoute,
      tryDirectFetch: async () => null,
      scrapeFirecrawl: async () => null,
      fetchViaExa: async () => ({ localPrice: 6.00, sourceSite: 'https://carrefouruae.com/new-sugar' }),
    });
    assert.equal(result.localPrice, 6.00);
    assert.equal(result.routeUpdate?.url, 'https://carrefouruae.com/new-sugar');
    assert.equal(result.routeUpdate?.hits, 1);
    assert.equal(result.routeDelete, false);
  });

  it('learned fail x2: routeDelete=true, routeUpdate=null, localPrice=null', async () => {
    // One prior failure on record; this run's failure is the second.
    const staleRoute = { ...baseRoute, failsSinceSuccess: 1 };
    const result = await processItemRoute({
      ...baseOpts,
      learned: staleRoute,
      tryDirectFetch: async () => null,
      scrapeFirecrawl: async () => null,
      fetchViaExa: async () => null,
    });
    assert.equal(result.routeDelete, true);
    assert.equal(result.routeUpdate, null);
    assert.equal(result.localPrice, null);
  });

  it('corrupted URL (bad host): routeDelete=true, tryDirectFetch never called (SSRF guard)', async () => {
    let directFetchCalled = false;
    let firecrawlCalled = false;
    let exaCalled = false;
    const badRoute = { ...baseRoute, url: 'https://evil.com/sugar' };
    const result = await processItemRoute({
      ...baseOpts,
      learned: badRoute,
      tryDirectFetch: async () => { directFetchCalled = true; return null; },
      scrapeFirecrawl: async () => { firecrawlCalled = true; return null; },
      fetchViaExa: async () => { exaCalled = true; return null; }, // EXA still runs to find a replacement
    });
    assert.equal(result.routeDelete, true);
    // Neither replay path may touch the disallowed URL.
    assert.equal(directFetchCalled, false);
    assert.equal(firecrawlCalled, false);
    // The EXA fallback must still run after the eviction.
    assert.equal(exaCalled, true);
  });

  it('EXA success but host not in allowlist: price returned, route NOT saved', async () => {
    const result = await processItemRoute({
      ...baseOpts,
      learned: undefined,
      fetchViaExa: async () => ({ localPrice: 5.50, sourceSite: 'https://evil.com/sugar' }),
    });
    assert.equal(result.localPrice, 5.50);
    assert.equal(result.routeUpdate, null);
    assert.equal(result.routeDelete, false);
  });
});
|
||||
Reference in New Issue
Block a user