mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas (#2086)
* fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas

  - scrapeAll() now uses 5-worker concurrent pool (was sequential) -- cuts 28-min run to ~5-6 min since each retailer hits a different domain
  - Move initProviders/teardownAll out of scrapeRetailer() to avoid registry teardown race when workers share the _providers singleton
  - sainsburys_gb: fix baseUrl to groceries.sainsburys.co.uk (Exa returns URLs from this subdomain, not www.sainsburys.co.uk); remove wrong path filter
  - coop_ch: fix urlPathContains /de/food/ -> /de/lebensmittel/ (German site)
  - migros_ch: remove urlPathContains /de/produkt/ (Exa doesn't index these paths)
  - naivas_ke: disable (Exa returns 0 matching URLs for www.naivas.online, wasting 72 API calls * 6s each per run)

* fix(consumer-prices): harden scrapeRetailer against stuck runs and teardown leaks

  P1: move API key validation before createScrapeRun() so a missing key never leaves a scrape_run row stuck in status='running' forever
  P2: wrap single-retailer CLI path in try/finally so teardownAll() is called even when scrapeRetailer() throws, preventing Playwright process leaks
  P2: add comment explaining why initProviders() is kept in scrapeAll() -- GenericPlaywrightAdapter (playwright/p0 adapters) uses fetchWithFallback() from the registry; search/exa-search bypass it with their own instances
  P3: add comment in migros_ch.yaml documenting why urlPathContains was removed
This commit is contained in:
@@ -9,7 +9,7 @@ retailer:
|
||||
|
||||
searchConfig:
|
||||
numResults: 5
|
||||
urlPathContains: /de/food/
|
||||
urlPathContains: /de/lebensmittel/
|
||||
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
|
||||
|
||||
rateLimit:
|
||||
|
||||
@@ -9,7 +9,8 @@ retailer:
|
||||
|
||||
searchConfig:
|
||||
numResults: 5
|
||||
urlPathContains: /de/produkt/
|
||||
# urlPathContains removed 2026-03: Exa does not index Migros pages under /de/produkt/ --
|
||||
# all 5 search results failed the domain/path check every run. Restore urlPathContains if Exa indexing improves.
|
||||
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
|
||||
|
||||
rateLimit:
|
||||
|
||||
@@ -5,7 +5,7 @@ retailer:
|
||||
currencyCode: KES
|
||||
adapter: search
|
||||
baseUrl: https://www.naivas.online
|
||||
enabled: true
|
||||
enabled: false
|
||||
|
||||
searchConfig:
|
||||
numResults: 5
|
||||
|
||||
@@ -4,12 +4,11 @@ retailer:
|
||||
marketCode: gb
|
||||
currencyCode: GBP
|
||||
adapter: search
|
||||
baseUrl: https://www.sainsburys.co.uk
|
||||
baseUrl: https://groceries.sainsburys.co.uk
|
||||
enabled: true
|
||||
|
||||
searchConfig:
|
||||
numResults: 5
|
||||
urlPathContains: /shop/gb/groceries/
|
||||
queryTemplate: "{canonicalName} grocery {market} {currency} price"
|
||||
|
||||
rateLimit:
|
||||
|
||||
@@ -63,19 +63,14 @@ async function updateScrapeRun(
|
||||
}
|
||||
|
||||
export async function scrapeRetailer(slug: string) {
|
||||
initProviders(process.env as Record<string, string>);
|
||||
|
||||
const config = loadRetailerConfig(slug);
|
||||
if (!config.enabled) {
|
||||
logger.info(`${slug} is disabled, skipping`);
|
||||
return;
|
||||
}
|
||||
|
||||
const retailerId = await getOrCreateRetailer(slug, config);
|
||||
const runId = await createScrapeRun(retailerId);
|
||||
|
||||
logger.info(`Run ${runId} started for ${slug}`);
|
||||
|
||||
// Validate API keys before opening a scrape_run row — an early throw here
|
||||
// would otherwise leave the run stuck in status='running' forever.
|
||||
const exaKey = (process.env.EXA_API_KEYS || process.env.EXA_API_KEY || '').split(/[\n,]+/)[0].trim();
|
||||
const fcKey = process.env.FIRECRAWL_API_KEY ?? '';
|
||||
|
||||
@@ -84,6 +79,11 @@ export async function scrapeRetailer(slug: string) {
|
||||
if (!fcKey) throw new Error(`search adapter requires FIRECRAWL_API_KEY (retailer: ${slug})`);
|
||||
}
|
||||
|
||||
const retailerId = await getOrCreateRetailer(slug, config);
|
||||
const runId = await createScrapeRun(retailerId);
|
||||
|
||||
logger.info(`Run ${runId} started for ${slug}`);
|
||||
|
||||
const adapter =
|
||||
config.adapter === 'search'
|
||||
? new SearchAdapter(new ExaProvider(exaKey), new FirecrawlProvider(fcKey))
|
||||
@@ -202,22 +202,44 @@ export async function scrapeRetailer(slug: string) {
|
||||
[retailerId, isSuccess ? new Date() : null, status, Math.round(parseSuccessRate * 100) / 100],
|
||||
);
|
||||
|
||||
await teardownAll();
|
||||
}
|
||||
|
||||
export async function scrapeAll() {
|
||||
// initProviders is required for GenericPlaywrightAdapter (playwright/p0 adapters use the
|
||||
// registry via fetchWithFallback). SearchAdapter and ExaSearchAdapter construct their own
|
||||
// provider instances directly from env vars and bypass the registry.
|
||||
initProviders(process.env as Record<string, string>);
|
||||
const configs = loadAllRetailerConfigs().filter((c) => c.enabled);
|
||||
logger.info(`Scraping ${configs.length} retailers`);
|
||||
for (const c of configs) {
|
||||
await scrapeRetailer(c.slug);
|
||||
}
|
||||
|
||||
// Run retailers in parallel: each hits a different domain so rate limits don't conflict.
|
||||
// Cap at 5 concurrent to avoid saturating Firecrawl's global request limits.
|
||||
const CONCURRENCY = 5;
|
||||
const queue = [...configs];
|
||||
const workers = Array.from({ length: Math.min(CONCURRENCY, queue.length) }, async () => {
|
||||
while (queue.length > 0) {
|
||||
const cfg = queue.shift()!;
|
||||
try {
|
||||
await scrapeRetailer(cfg.slug);
|
||||
} catch (err) {
|
||||
logger.warn(`scrapeRetailer ${cfg.slug} failed: ${err}`);
|
||||
}
|
||||
}
|
||||
});
|
||||
await Promise.all(workers);
|
||||
|
||||
await teardownAll();
|
||||
}
|
||||
|
||||
async function main() {
|
||||
try {
|
||||
if (process.argv[2]) {
|
||||
await scrapeRetailer(process.argv[2]);
|
||||
initProviders(process.env as Record<string, string>);
|
||||
try {
|
||||
await scrapeRetailer(process.argv[2]);
|
||||
} finally {
|
||||
await teardownAll();
|
||||
}
|
||||
} else {
|
||||
await scrapeAll();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user