fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas (#2086)

* fix(consumer-prices): parallel scraping, retailer config fixes, disable naivas

- scrapeAll() now uses 5-worker concurrent pool (was sequential) -- cuts
  28-min run to ~5-6 min since each retailer hits a different domain
- Move initProviders/teardownAll out of scrapeRetailer() to avoid
  registry teardown race when workers share the _providers singleton
- sainsburys_gb: fix baseUrl to groceries.sainsburys.co.uk (Exa returns
  URLs from this subdomain, not www.sainsburys.co.uk); remove wrong path filter
- coop_ch: fix urlPathContains /de/food/ -> /de/lebensmittel/ (German site)
- migros_ch: remove urlPathContains /de/produkt/ (Exa doesn't index these paths)
- naivas_ke: disable (Exa returns 0 matching URLs for www.naivas.online,
  wasting 72 API calls at ~6 s each — roughly 7 min — per run)

* fix(consumer-prices): harden scrapeRetailer against stuck runs and teardown leaks

P1: move API key validation before createScrapeRun() so a missing key never
leaves a scrape_run row stuck in status='running' forever

P2: wrap single-retailer CLI path in try/finally so teardownAll() is called
even when scrapeRetailer() throws, preventing Playwright process leaks

P2: add comment explaining why initProviders() is kept in scrapeAll() --
GenericPlaywrightAdapter (playwright/p0 adapters) uses fetchWithFallback()
from the registry; search/exa-search bypass it with their own instances

P3: add comment in migros_ch.yaml documenting why urlPathContains was removed
This commit is contained in:
Elie Habib
2026-03-23 01:31:56 +04:00
committed by GitHub
parent 6bd17fcec0
commit 47df75daab
5 changed files with 39 additions and 17 deletions

View File

@@ -9,7 +9,7 @@ retailer:
searchConfig:
numResults: 5
urlPathContains: /de/food/
urlPathContains: /de/lebensmittel/
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
rateLimit:

View File

@@ -9,7 +9,8 @@ retailer:
searchConfig:
numResults: 5
urlPathContains: /de/produkt/
# urlPathContains removed 2026-03: Exa does not index Migros pages under /de/produkt/ --
# all 5 search results failed the domain/path check every run. Remove if Exa indexing improves.
queryTemplate: "{canonicalName} Lebensmittel {market} {currency} Preis"
rateLimit:

View File

@@ -5,7 +5,7 @@ retailer:
currencyCode: KES
adapter: search
baseUrl: https://www.naivas.online
enabled: true
enabled: false
searchConfig:
numResults: 5

View File

@@ -4,12 +4,11 @@ retailer:
marketCode: gb
currencyCode: GBP
adapter: search
baseUrl: https://www.sainsburys.co.uk
baseUrl: https://groceries.sainsburys.co.uk
enabled: true
searchConfig:
numResults: 5
urlPathContains: /shop/gb/groceries/
queryTemplate: "{canonicalName} grocery {market} {currency} price"
rateLimit:

View File

@@ -63,19 +63,14 @@ async function updateScrapeRun(
}
export async function scrapeRetailer(slug: string) {
initProviders(process.env as Record<string, string>);
const config = loadRetailerConfig(slug);
if (!config.enabled) {
logger.info(`${slug} is disabled, skipping`);
return;
}
const retailerId = await getOrCreateRetailer(slug, config);
const runId = await createScrapeRun(retailerId);
logger.info(`Run ${runId} started for ${slug}`);
// Validate API keys before opening a scrape_run row — an early throw here
// would otherwise leave the run stuck in status='running' forever.
const exaKey = (process.env.EXA_API_KEYS || process.env.EXA_API_KEY || '').split(/[\n,]+/)[0].trim();
const fcKey = process.env.FIRECRAWL_API_KEY ?? '';
@@ -84,6 +79,11 @@ export async function scrapeRetailer(slug: string) {
if (!fcKey) throw new Error(`search adapter requires FIRECRAWL_API_KEY (retailer: ${slug})`);
}
const retailerId = await getOrCreateRetailer(slug, config);
const runId = await createScrapeRun(retailerId);
logger.info(`Run ${runId} started for ${slug}`);
const adapter =
config.adapter === 'search'
? new SearchAdapter(new ExaProvider(exaKey), new FirecrawlProvider(fcKey))
@@ -202,22 +202,44 @@ export async function scrapeRetailer(slug: string) {
[retailerId, isSuccess ? new Date() : null, status, Math.round(parseSuccessRate * 100) / 100],
);
await teardownAll();
}
export async function scrapeAll() {
// initProviders is required for GenericPlaywrightAdapter (playwright/p0 adapters use the
// registry via fetchWithFallback). SearchAdapter and ExaSearchAdapter construct their own
// provider instances directly from env vars and bypass the registry.
initProviders(process.env as Record<string, string>);
const configs = loadAllRetailerConfigs().filter((c) => c.enabled);
logger.info(`Scraping ${configs.length} retailers`);
for (const c of configs) {
await scrapeRetailer(c.slug);
}
// Run retailers in parallel: each hits a different domain so rate limits don't conflict.
// Cap at 5 concurrent to avoid saturating Firecrawl's global request limits.
const CONCURRENCY = 5;
const queue = [...configs];
const workers = Array.from({ length: Math.min(CONCURRENCY, queue.length) }, async () => {
while (queue.length > 0) {
const cfg = queue.shift()!;
try {
await scrapeRetailer(cfg.slug);
} catch (err) {
logger.warn(`scrapeRetailer ${cfg.slug} failed: ${err}`);
}
}
});
await Promise.all(workers);
await teardownAll();
}
async function main() {
try {
if (process.argv[2]) {
await scrapeRetailer(process.argv[2]);
initProviders(process.env as Record<string, string>);
try {
await scrapeRetailer(process.argv[2]);
} finally {
await teardownAll();
}
} else {
await scrapeAll();
}