diff --git a/consumer-prices-core/configs/retailers/coop_ch.yaml b/consumer-prices-core/configs/retailers/coop_ch.yaml index 676904dd4..d11291a2b 100644 --- a/consumer-prices-core/configs/retailers/coop_ch.yaml +++ b/consumer-prices-core/configs/retailers/coop_ch.yaml @@ -9,7 +9,7 @@ retailer: searchConfig: numResults: 5 - urlPathContains: /de/food/ + urlPathContains: /de/lebensmittel/ queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis" rateLimit: diff --git a/consumer-prices-core/configs/retailers/migros_ch.yaml b/consumer-prices-core/configs/retailers/migros_ch.yaml index 284a9b944..a118f1708 100644 --- a/consumer-prices-core/configs/retailers/migros_ch.yaml +++ b/consumer-prices-core/configs/retailers/migros_ch.yaml @@ -9,7 +9,8 @@ retailer: searchConfig: numResults: 5 - urlPathContains: /de/produkt/ + # urlPathContains removed 2026-03: Exa does not index Migros pages under /de/produkt/ -- + # all 5 search results failed the domain/path check every run. Remove if Exa indexing improves. 
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis" rateLimit: diff --git a/consumer-prices-core/configs/retailers/naivas_ke.yaml b/consumer-prices-core/configs/retailers/naivas_ke.yaml index af4e1363c..3530d1208 100644 --- a/consumer-prices-core/configs/retailers/naivas_ke.yaml +++ b/consumer-prices-core/configs/retailers/naivas_ke.yaml @@ -5,7 +5,7 @@ retailer: currencyCode: KES adapter: search baseUrl: https://www.naivas.online - enabled: true + enabled: false searchConfig: numResults: 5 diff --git a/consumer-prices-core/configs/retailers/sainsburys_gb.yaml b/consumer-prices-core/configs/retailers/sainsburys_gb.yaml index fcbff5b1b..3ed0b0c4d 100644 --- a/consumer-prices-core/configs/retailers/sainsburys_gb.yaml +++ b/consumer-prices-core/configs/retailers/sainsburys_gb.yaml @@ -4,12 +4,11 @@ retailer: marketCode: gb currencyCode: GBP adapter: search - baseUrl: https://www.sainsburys.co.uk + baseUrl: https://groceries.sainsburys.co.uk enabled: true searchConfig: numResults: 5 - urlPathContains: /shop/gb/groceries/ queryTemplate: "{canonicalName} grocery {market} {currency} price" rateLimit: diff --git a/consumer-prices-core/src/jobs/scrape.ts b/consumer-prices-core/src/jobs/scrape.ts index 3727cc898..ab0b068f8 100644 --- a/consumer-prices-core/src/jobs/scrape.ts +++ b/consumer-prices-core/src/jobs/scrape.ts @@ -63,19 +63,14 @@ } export async function scrapeRetailer(slug: string) { - initProviders(process.env as Record<string, string>); - const config = loadRetailerConfig(slug); if (!config.enabled) { logger.info(`${slug} is disabled, skipping`); return; } - const retailerId = await getOrCreateRetailer(slug, config); - const runId = await createScrapeRun(retailerId); - - logger.info(`Run ${runId} started for ${slug}`); - + // Validate API keys before opening a scrape_run row — an early throw here + // would otherwise leave the run stuck in status='running' forever. 
const exaKey = (process.env.EXA_API_KEYS || process.env.EXA_API_KEY || '').split(/[\n,]+/)[0].trim(); const fcKey = process.env.FIRECRAWL_API_KEY ?? ''; @@ -84,6 +79,11 @@ export async function scrapeRetailer(slug: string) { if (!fcKey) throw new Error(`search adapter requires FIRECRAWL_API_KEY (retailer: ${slug})`); } + const retailerId = await getOrCreateRetailer(slug, config); + const runId = await createScrapeRun(retailerId); + + logger.info(`Run ${runId} started for ${slug}`); + const adapter = config.adapter === 'search' ? new SearchAdapter(new ExaProvider(exaKey), new FirecrawlProvider(fcKey)) @@ -202,22 +202,44 @@ export async function scrapeRetailer(slug: string) { [retailerId, isSuccess ? new Date() : null, status, Math.round(parseSuccessRate * 100) / 100], ); - await teardownAll(); } export async function scrapeAll() { + // initProviders is required for GenericPlaywrightAdapter (playwright/p0 adapters use the + // registry via fetchWithFallback). SearchAdapter and ExaSearchAdapter construct their own + // provider instances directly from env vars and bypass the registry. initProviders(process.env as Record<string, string>); const configs = loadAllRetailerConfigs().filter((c) => c.enabled); logger.info(`Scraping ${configs.length} retailers`); - for (const c of configs) { - await scrapeRetailer(c.slug); - } + + // Run retailers in parallel: each hits a different domain so rate limits don't conflict. + // Cap at 5 concurrent to avoid saturating Firecrawl's global request limits. 
+ const CONCURRENCY = 5; + const queue = [...configs]; + const workers = Array.from({ length: Math.min(CONCURRENCY, queue.length) }, async () => { + while (queue.length > 0) { + const cfg = queue.shift()!; + try { + await scrapeRetailer(cfg.slug); + } catch (err) { + logger.warn(`scrapeRetailer ${cfg.slug} failed: ${err}`); + } + } + }); + await Promise.all(workers); + + await teardownAll(); } async function main() { try { if (process.argv[2]) { - await scrapeRetailer(process.argv[2]); + initProviders(process.env as Record<string, string>); + try { + await scrapeRetailer(process.argv[2]); + } finally { + await teardownAll(); + } } else { await scrapeAll(); }