fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas (#2086)

* fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas

- scrapeAll() now uses 5-worker concurrent pool (was sequential) -- cuts
  28-min run to ~5-6 min since each retailer hits a different domain
- Move initProviders/teardownAll out of scrapeRetailer() to avoid
  registry teardown race when workers share the _providers singleton
- sainsburys_gb: fix baseUrl to groceries.sainsburys.co.uk (Exa returns
  URLs from this subdomain, not www.sainsburys.co.uk); remove wrong path filter
- coop_ch: fix urlPathContains /de/food/ -> /de/lebensmittel/ (German site)
- migros_ch: remove urlPathContains /de/produkt/ (Exa doesn't index these paths)
- naivas_ke: disable (Exa returns 0 matching URLs for www.naivas.online,
  wasting 72 API calls at ~6 s each — roughly 7 min — per run)

* fix(consumer-prices): harden scrapeRetailer against stuck runs and teardown leaks

P1: move API key validation before createScrapeRun() so a missing key never
leaves a scrape_run row stuck in status='running' forever

P2: wrap single-retailer CLI path in try/finally so teardownAll() is called
even when scrapeRetailer() throws, preventing Playwright process leaks

P2: add comment explaining why initProviders() is kept in scrapeAll() --
GenericPlaywrightAdapter (playwright/p0 adapters) uses fetchWithFallback()
from the registry; search/exa-search bypass it with their own instances

P3: add comment in migros_ch.yaml documenting why urlPathContains was removed
This commit is contained in:
Elie Habib
2026-03-23 01:31:56 +04:00
committed by GitHub
parent 6bd17fcec0
commit 47df75daab
5 changed files with 39 additions and 17 deletions

View File

@@ -9,7 +9,7 @@ retailer:
searchConfig:
numResults: 5
urlPathContains: /de/food/
urlPathContains: /de/lebensmittel/
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
rateLimit:

View File

@@ -9,7 +9,8 @@ retailer:
searchConfig:
numResults: 5
urlPathContains: /de/produkt/
# urlPathContains removed 2026-03: Exa does not index Migros pages under /de/produkt/ --
# all 5 search results failed the domain/path check every run. Remove if Exa indexing improves.
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
rateLimit:

View File

@@ -5,7 +5,7 @@ retailer:
currencyCode: KES
adapter: search
baseUrl: https://www.naivas.online
enabled: true
enabled: false
searchConfig:
numResults: 5

View File

@@ -4,12 +4,11 @@ retailer:
marketCode: gb
currencyCode: GBP
adapter: search
baseUrl: https://www.sainsburys.co.uk
baseUrl: https://groceries.sainsburys.co.uk
enabled: true
searchConfig:
numResults: 5
urlPathContains: /shop/gb/groceries/
queryTemplate: "{canonicalName} grocery {market} {currency} price"
rateLimit:

View File

@@ -63,19 +63,14 @@ async function updateScrapeRun(
}
export async function scrapeRetailer(slug: string) {
initProviders(process.env as Record<string, string>);
const config = loadRetailerConfig(slug);
if (!config.enabled) {
logger.info(`${slug} is disabled, skipping`);
return;
}
const retailerId = await getOrCreateRetailer(slug, config);
const runId = await createScrapeRun(retailerId);
logger.info(`Run ${runId} started for ${slug}`);
// Validate API keys before opening a scrape_run row — an early throw here
// would otherwise leave the run stuck in status='running' forever.
const exaKey = (process.env.EXA_API_KEYS || process.env.EXA_API_KEY || '').split(/[\n,]+/)[0].trim();
const fcKey = process.env.FIRECRAWL_API_KEY ?? '';
@@ -84,6 +79,11 @@ export async function scrapeRetailer(slug: string) {
if (!fcKey) throw new Error(`search adapter requires FIRECRAWL_API_KEY (retailer: ${slug})`);
}
const retailerId = await getOrCreateRetailer(slug, config);
const runId = await createScrapeRun(retailerId);
logger.info(`Run ${runId} started for ${slug}`);
const adapter =
config.adapter === 'search'
? new SearchAdapter(new ExaProvider(exaKey), new FirecrawlProvider(fcKey))
@@ -202,22 +202,44 @@ export async function scrapeRetailer(slug: string) {
[retailerId, isSuccess ? new Date() : null, status, Math.round(parseSuccessRate * 100) / 100],
);
await teardownAll();
}
export async function scrapeAll() {
// initProviders is required for GenericPlaywrightAdapter (playwright/p0 adapters use the
// registry via fetchWithFallback). SearchAdapter and ExaSearchAdapter construct their own
// provider instances directly from env vars and bypass the registry.
initProviders(process.env as Record<string, string>);
const configs = loadAllRetailerConfigs().filter((c) => c.enabled);
logger.info(`Scraping ${configs.length} retailers`);
for (const c of configs) {
await scrapeRetailer(c.slug);
}
// Run retailers in parallel: each hits a different domain so rate limits don't conflict.
// Cap at 5 concurrent to avoid saturating Firecrawl's global request limits.
const CONCURRENCY = 5;
const queue = [...configs];
const workers = Array.from({ length: Math.min(CONCURRENCY, queue.length) }, async () => {
while (queue.length > 0) {
const cfg = queue.shift()!;
try {
await scrapeRetailer(cfg.slug);
} catch (err) {
logger.warn(`scrapeRetailer ${cfg.slug} failed: ${err}`);
}
}
});
await Promise.all(workers);
await teardownAll();
}
async function main() {
try {
if (process.argv[2]) {
await scrapeRetailer(process.argv[2]);
initProviders(process.env as Record<string, string>);
try {
await scrapeRetailer(process.argv[2]);
} finally {
await teardownAll();
}
} else {
await scrapeAll();
}