diff --git a/consumer-prices-core/configs/baskets/essentials_ae.yaml b/consumer-prices-core/configs/baskets/essentials_ae.yaml index 3ed839ad8..1c93164d2 100644 --- a/consumer-prices-core/configs/baskets/essentials_ae.yaml +++ b/consumer-prices-core/configs/baskets/essentials_ae.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1400 maxBaseQty: 1600 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_processed_200g category: dairy @@ -108,6 +112,7 @@ basket: substitutionGroup: cheese_processed minBaseQty: 150 maxBaseQty: 250 + negativeTokens: ["vegan", "gouda", "cheddar", "parmesan", "mozzarella", "feta"] - id: yogurt_500g category: dairy @@ -117,3 +122,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_au.yaml b/consumer-prices-core/configs/baskets/essentials_au.yaml index b6db1156f..995eadb69 100644 --- a/consumer-prices-core/configs/baskets/essentials_au.yaml +++ b/consumer-prices-core/configs/baskets/essentials_au.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: 
onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1300 maxBaseQty: 1700 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_cheddar_500g category: dairy @@ -117,3 +121,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_br.yaml b/consumer-prices-core/configs/baskets/essentials_br.yaml index 27599a527..cb2058f88 100644 --- a/consumer-prices-core/configs/baskets/essentials_br.yaml +++ b/consumer-prices-core/configs/baskets/essentials_br.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1300 maxBaseQty: 1700 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: yogurt_500g category: dairy @@ -108,3 +112,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git 
a/consumer-prices-core/configs/baskets/essentials_ch.yaml b/consumer-prices-core/configs/baskets/essentials_ch.yaml index a34379b5a..96ad61739 100644 --- a/consumer-prices-core/configs/baskets/essentials_ch.yaml +++ b/consumer-prices-core/configs/baskets/essentials_ch.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 400 maxBaseQty: 600 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1300 maxBaseQty: 1700 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_200g category: dairy @@ -117,3 +121,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_gb.yaml b/consumer-prices-core/configs/baskets/essentials_gb.yaml index d2c2caaf6..75d7873f3 100644 --- a/consumer-prices-core/configs/baskets/essentials_gb.yaml +++ b/consumer-prices-core/configs/baskets/essentials_gb.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 7000 
maxBaseQty: 10000 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_cheddar_400g category: dairy @@ -117,3 +121,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_in.yaml b/consumer-prices-core/configs/baskets/essentials_in.yaml index da16837ce..9acd8acbd 100644 --- a/consumer-prices-core/configs/baskets/essentials_in.yaml +++ b/consumer-prices-core/configs/baskets/essentials_in.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: paneer_200g category: dairy @@ -117,3 +121,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 350 maxBaseQty: 450 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_ke.yaml b/consumer-prices-core/configs/baskets/essentials_ke.yaml index 71aa4fb15..c0e20a277 100644 --- a/consumer-prices-core/configs/baskets/essentials_ke.yaml +++ 
b/consumer-prices-core/configs/baskets/essentials_ke.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1300 maxBaseQty: 1700 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_processed_200g category: dairy @@ -108,6 +112,7 @@ basket: substitutionGroup: cheese_processed minBaseQty: 150 maxBaseQty: 250 + negativeTokens: ["vegan", "gouda", "cheddar", "parmesan", "mozzarella", "feta"] - id: yogurt_500g category: dairy @@ -117,3 +122,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_sa.yaml b/consumer-prices-core/configs/baskets/essentials_sa.yaml index 866bdc16b..5a4ea65cb 100644 --- a/consumer-prices-core/configs/baskets/essentials_sa.yaml +++ b/consumer-prices-core/configs/baskets/essentials_sa.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1400 
maxBaseQty: 1600 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_processed_200g category: dairy @@ -108,6 +112,7 @@ basket: substitutionGroup: cheese_processed minBaseQty: 150 maxBaseQty: 250 + negativeTokens: ["vegan", "gouda", "cheddar", "parmesan", "mozzarella", "feta"] - id: yogurt_500g category: dairy @@ -117,3 +122,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_sg.yaml b/consumer-prices-core/configs/baskets/essentials_sg.yaml index e95bf218f..1aad1eeae 100644 --- a/consumer-prices-core/configs/baskets/essentials_sg.yaml +++ b/consumer-prices-core/configs/baskets/essentials_sg.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 400 maxBaseQty: 600 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_500g category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 400 maxBaseQty: 600 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 1300 maxBaseQty: 1700 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 900 maxBaseQty: 1100 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_200g category: dairy @@ -108,6 +112,7 @@ basket: substitutionGroup: cheese_processed minBaseQty: 150 maxBaseQty: 250 + negativeTokens: ["vegan", "gouda", "cheddar", "parmesan", "mozzarella", "feta"] - id: yogurt_500g category: dairy @@ -117,3 +122,4 @@ basket: 
substitutionGroup: yogurt_plain minBaseQty: 450 maxBaseQty: 550 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/configs/baskets/essentials_us.yaml b/consumer-prices-core/configs/baskets/essentials_us.yaml index d06e1c677..b40514db6 100644 --- a/consumer-prices-core/configs/baskets/essentials_us.yaml +++ b/consumer-prices-core/configs/baskets/essentials_us.yaml @@ -72,6 +72,7 @@ basket: substitutionGroup: tomatoes minBaseQty: 800 maxBaseQty: 1200 + negativeTokens: ["chopped", "peeled", "sauce", "paste", "canned", "puree", "sundried"] - id: onions_1kg category: onions @@ -81,6 +82,7 @@ basket: substitutionGroup: onions minBaseQty: 1000 maxBaseQty: 1600 + negativeTokens: ["powder", "flakes", "rings", "pickled", "seeds", "sets"] - id: water_1_5l category: water @@ -90,6 +92,7 @@ basket: substitutionGroup: water_still minBaseQty: 6000 maxBaseQty: 10000 + negativeTokens: ["sparkling", "flavored", "flavoured"] - id: sugar_1kg category: sugar @@ -99,6 +102,7 @@ basket: substitutionGroup: sugar_white minBaseQty: 1600 maxBaseQty: 2000 + negativeTokens: ["brown", "baby", "mascavo", "sachets", "powdered"] - id: cheese_cheddar_200g category: dairy @@ -117,3 +121,4 @@ basket: substitutionGroup: yogurt_plain minBaseQty: 800 maxBaseQty: 1000 + negativeTokens: ["drink", "drinking", "plant-based", "vegan", "greek", "fruit"] diff --git a/consumer-prices-core/migrations/008_candidate_match_status.sql b/consumer-prices-core/migrations/008_candidate_match_status.sql new file mode 100644 index 000000000..9a1884733 --- /dev/null +++ b/consumer-prices-core/migrations/008_candidate_match_status.sql @@ -0,0 +1,10 @@ +-- Widen product_matches.match_status to include 'candidate' — the state written +-- for weak search hits that must not enter aggregates but whose evidence we +-- still want to keep so the next scrape doesn't re-pay the same Exa/Firecrawl +-- cost. 
Readers that filter on ('auto','approved') naturally exclude candidates. + +ALTER TABLE product_matches DROP CONSTRAINT IF EXISTS product_matches_match_status_check; + +ALTER TABLE product_matches + ADD CONSTRAINT product_matches_match_status_check + CHECK (match_status IN ('auto','review','approved','rejected','candidate')); diff --git a/consumer-prices-core/src/adapters/search.ts b/consumer-prices-core/src/adapters/search.ts index 7fb3eb531..dbadf77d1 100644 --- a/consumer-prices-core/src/adapters/search.ts +++ b/consumer-prices-core/src/adapters/search.ts @@ -20,6 +20,8 @@ import type { RetailerConfig } from '../config/types.js'; import type { AdapterContext, FetchResult, ParsedProduct, RetailerAdapter, Target } from './types.js'; import { MARKET_NAMES } from './market-names.js'; import { parseSize } from '../normalizers/size.js'; +import { validateSearchHit, type ValidatorResult } from './validator.js'; +import type { BasketItem } from '../config/types.js'; /** Packaging/container words that are not product identity tokens. 
*/ const PACKAGING_WORDS = new Set(['pack', 'box', 'bag', 'container', 'bottle', 'can', 'jar', 'tin', 'set', 'kit', 'bundle']); @@ -93,12 +95,16 @@ interface ExtractedProduct { sizeText?: string; } +type ItemConstraints = Pick; + interface SearchPayload { extracted: ExtractedProduct; productUrl: string; canonicalName: string; basketSlug: string; itemCategory: string; + itemConstraints: ItemConstraints; + validator?: ValidatorResult; direct?: boolean; pinnedProductId?: string; matchId?: string; @@ -127,6 +133,13 @@ export class SearchAdapter implements RetailerAdapter { for (const item of basket.items) { const pinKey = `${basket.slug}:${item.canonicalName}`; const pinned = ctx.pinnedUrls?.get(pinKey); + const itemConstraints: ItemConstraints = { + baseUnit: item.baseUnit, + minBaseQty: item.minBaseQty, + maxBaseQty: item.maxBaseQty, + negativeTokens: item.negativeTokens, + substitutionGroup: item.substitutionGroup, + }; if (pinned && isAllowedHost(pinned.sourceUrl, domain)) { targets.push({ @@ -138,6 +151,7 @@ export class SearchAdapter implements RetailerAdapter { domain, basketSlug: basket.slug, currency: ctx.config.currencyCode, + itemConstraints, direct: true, pinnedProductId: pinned.productId, matchId: pinned.matchId, @@ -156,6 +170,7 @@ export class SearchAdapter implements RetailerAdapter { domain, basketSlug: basket.slug, currency: ctx.config.currencyCode, + itemConstraints, direct: false, }, }); @@ -171,7 +186,8 @@ export class SearchAdapter implements RetailerAdapter { url: string, canonicalName: string, currency: string, - ): Promise { + itemConstraints?: ItemConstraints, + ): Promise<{ extracted: ExtractedProduct; validator: ValidatorResult } | null> { const sizeHint = extractSizeHint(canonicalName); const sizeClause = sizeHint ? ` You are looking for "${canonicalName}". The product MUST be ${sizeHint}. 
If the page shows a different size, pack count, or bulk case, return null for price.` @@ -195,7 +211,25 @@ export class SearchAdapter implements RetailerAdapter { if (typeof price !== 'number' || !Number.isFinite(price) || price <= 0) { return null; } - if (!isTitlePlausible(canonicalName, data.productName)) { + const legacyPass = isTitlePlausible(canonicalName, data.productName); + const validator = validateSearchHit({ + canonicalName, + productName: data.productName, + sizeText: data.sizeText, + item: itemConstraints ?? { baseUnit: '' }, + }); + + // Shadow-mode: the strict validator runs alongside the legacy boolean gate + // but does NOT block a hit on its own yet. When validator.ok=false and + // legacy would have accepted, log a wouldReject with reasons so the diff + // report can inform the rollout decision to flip the hard gate. + if (legacyPass && !validator.ok) { + ctx.logger.warn( + ` [search:shadow-reject] "${canonicalName}" would reject productName="${data.productName}" reasons=${validator.reasons.join(',')} score=${validator.score.toFixed(2)}`, + ); + } + + if (!legacyPass) { return null; } @@ -206,15 +240,16 @@ export class SearchAdapter implements RetailerAdapter { data.inStock = true; } - return data; + return { extracted: data, validator }; } async fetchTarget(ctx: AdapterContext, target: Target): Promise { - const { canonicalName, domain, currency, basketSlug, direct, pinnedProductId, matchId } = target.metadata as { + const { canonicalName, domain, currency, basketSlug, itemConstraints, direct, pinnedProductId, matchId } = target.metadata as { canonicalName: string; domain: string; currency: string; basketSlug: string; + itemConstraints: ItemConstraints; direct: boolean; pinnedProductId?: string; matchId?: string; @@ -223,19 +258,21 @@ export class SearchAdapter implements RetailerAdapter { // Direct path: skip Exa, call Firecrawl on pinned URL if (direct) { try { - const extracted = await this._extractFromUrl(ctx, target.url, canonicalName, 
currency); - if (extracted) { + const result = await this._extractFromUrl(ctx, target.url, canonicalName, currency, itemConstraints); + if (result) { ctx.logger.info( - ` [search:pin] ${canonicalName}: price=${extracted.price} ${extracted.currency} from ${target.url}`, + ` [search:pin] ${canonicalName}: price=${result.extracted.price} ${result.extracted.currency} from ${target.url}`, ); return { url: target.url, html: JSON.stringify({ - extracted, + extracted: result.extracted, productUrl: target.url, canonicalName, basketSlug, itemCategory: target.category, + itemConstraints, + validator: result.validator, direct: true, pinnedProductId, matchId, @@ -286,15 +323,15 @@ export class SearchAdapter implements RetailerAdapter { } // Stage 2: Firecrawl structured extraction — iterate safe URLs until one yields a valid price - let extracted: ExtractedProduct | null = null; + let picked: { extracted: ExtractedProduct; validator: ValidatorResult } | null = null; let usedUrl = safeUrls[0]; const lastErrors: string[] = []; for (const url of safeUrls) { try { - const result = await this._extractFromUrl(ctx, url, canonicalName, currency); + const result = await this._extractFromUrl(ctx, url, canonicalName, currency, itemConstraints); if (result) { - extracted = result; + picked = result; usedUrl = url; break; } @@ -306,24 +343,26 @@ export class SearchAdapter implements RetailerAdapter { } } - if (extracted === null) { + if (picked === null) { throw new Error( `All ${safeUrls.length} URLs failed extraction for "${canonicalName}".${lastErrors.length ? 
` Last: ${lastErrors.at(-1)}` : ''}`, ); } ctx.logger.info( - ` [search:extract] ${canonicalName}: price=${extracted.price} ${extracted.currency} from ${usedUrl}`, + ` [search:extract] ${canonicalName}: price=${picked.extracted.price} ${picked.extracted.currency} from ${usedUrl}`, ); return { url: usedUrl, html: JSON.stringify({ - extracted, + extracted: picked.extracted, productUrl: usedUrl, canonicalName, basketSlug, itemCategory: target.category, + itemConstraints, + validator: picked.validator, direct: false, } satisfies SearchPayload), statusCode: 200, @@ -332,7 +371,7 @@ export class SearchAdapter implements RetailerAdapter { } async parseListing(ctx: AdapterContext, result: FetchResult): Promise { - const { extracted, productUrl, canonicalName, basketSlug, itemCategory, direct, pinnedProductId, matchId } = + const { extracted, productUrl, canonicalName, basketSlug, itemCategory, itemConstraints, validator, direct, pinnedProductId, matchId } = JSON.parse(result.html) as SearchPayload; const priceResult = z.number().positive().finite().safeParse(extracted?.price); @@ -371,7 +410,7 @@ export class SearchAdapter implements RetailerAdapter { // inStock defaults to true when Firecrawl does not return the field. // This is a conservative assumption — monitor for out-of-stock false positives. inStock: extracted.inStock ?? 
true, - rawPayload: { extracted, basketSlug, itemCategory, canonicalName, direct, pinnedProductId, matchId }, + rawPayload: { extracted, basketSlug, itemCategory, canonicalName, itemConstraints, validator, direct, pinnedProductId, matchId }, }, ]; } diff --git a/consumer-prices-core/src/adapters/validator.test.ts b/consumer-prices-core/src/adapters/validator.test.ts new file mode 100644 index 000000000..3c6803768 --- /dev/null +++ b/consumer-prices-core/src/adapters/validator.test.ts @@ -0,0 +1,294 @@ +import { describe, it, expect } from 'vitest'; +import { validateSearchHit, AUTO_MATCH_THRESHOLD } from './validator.js'; +import type { BasketItem } from '../config/types.js'; + +const item = (over: Partial = {}): BasketItem => ({ + id: 'x', + category: 'x', + canonicalName: 'x', + weight: 0.1, + baseUnit: 'g', + ...over, +}); + +describe('validateSearchHit — known bad log examples', () => { + it('rejects mango sugar baby for White Sugar 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'White Sugar 1kg', + productName: 'mango sugar baby india 1 kg', + sizeText: '1 kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['baby', 'brown', 'mascavo', 'sachets'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:baby'))).toBe(true); + }); + + it('rejects vegan gouda for Processed Cheese Slices', () => { + const r = validateSearchHit({ + canonicalName: 'Processed Cheese Slices 200g', + productName: 'vegan gouda slices 200g', + sizeText: '200 g', + item: item({ + baseUnit: 'g', minBaseQty: 180, maxBaseQty: 220, + negativeTokens: ['vegan', 'gouda', 'cheddar'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:vegan'))).toBe(true); + }); + + it('rejects onion powder for Onions 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'Onions 1kg', + productName: 'Onion Powder 100g', + sizeText: '100 g', + item: item({ + 
baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['powder', 'flakes'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:powder'))).toBe(true); + }); + + it('rejects chopped canned tomatoes for Tomatoes Fresh 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'Tomatoes Fresh 1kg', + productName: 'Chopped Tomatoes 400g canned', + sizeText: '400 g', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['chopped', 'peeled', 'sauce', 'paste', 'canned'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:'))).toBe(true); + }); + + it('rejects plant-based yogurt for Plain Yogurt 500g', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Plant-Based Almond Yogurt 500g', + sizeText: '500 g', + item: item({ + baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550, + negativeTokens: ['drink', 'drinking', 'plant-based', 'vegan'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:plant-based'))).toBe(true); + }); + + it('rejects drinking yogurt for Plain Yogurt 500g', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Dairy Drinking Yogurt 500g', + sizeText: '500 g', + item: item({ + baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550, + negativeTokens: ['drink', 'drinking', 'plant-based', 'vegan'], + }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('negative-token:drinking'))).toBe(true); + }); +}); + +describe('validateSearchHit — positive counterparts must still pass', () => { + it('accepts normal white sugar 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'White Sugar 1kg', + productName: 'Al Khaleej White Sugar 1 kg', + sizeText: '1 kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['brown', 'baby', 
'mascavo', 'sachets', 'powdered'], + }), + }); + expect(r.ok).toBe(true); + expect(r.signals.sizeWindow).toBe('pass'); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + // Regression: "cane" is a legitimate descriptor for white cane sugar. + // An earlier iteration of negativeTokens included "cane" and would have + // downgraded real SKUs to candidate. Guard against any future edit that + // re-adds "cane" without considering this positive case. + it('accepts white cane sugar 1kg — cane is not a class error', () => { + const r = validateSearchHit({ + canonicalName: 'White Sugar 1kg', + productName: 'Silver Spoon White Cane Sugar 1kg', + sizeText: '1 kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['brown', 'baby', 'mascavo', 'sachets', 'powdered'], + }), + }); + expect(r.ok).toBe(true); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + it('accepts fresh whole onions 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'Onions 1kg', + productName: 'Fresh Red Onions 1kg', + sizeText: '1kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['powder', 'flakes'], + }), + }); + expect(r.ok).toBe(true); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + // Regression: compact size tokens like "1kg" used to be kept as identity + // tokens, but Firecrawl often emits "1 kg" (spaced) which tokenises to + // ["1","kg"] — both below the length>2 floor — so "1kg" could never + // match. For short canonical names like "Onions 1kg", that dropped the + // token overlap from 1.0 to 0.5 and pushed valid hits below the + // AUTO_MATCH_THRESHOLD. Size fidelity is already enforced by the + // quantity-window check; identity tokens should ignore size. 
+ it('overlap ignores compact size token so spaced-size extractions pass', () => { + const r = validateSearchHit({ + canonicalName: 'Onions 1kg', + productName: 'Fresh Red Onions 1 kg', + sizeText: '1 kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['powder', 'flakes'], + }), + }); + expect(r.ok).toBe(true); + expect(r.signals.tokenOverlap).toBe(1); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + it('accepts fresh tomatoes 1kg', () => { + const r = validateSearchHit({ + canonicalName: 'Tomatoes Fresh 1kg', + productName: 'Fresh Tomatoes 1kg', + sizeText: '1 kg', + item: item({ + baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100, + negativeTokens: ['chopped', 'peeled', 'sauce', 'paste', 'canned'], + }), + }); + expect(r.ok).toBe(true); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + it('accepts normal plain yogurt 500g', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Al Ain Plain Yogurt 500g', + sizeText: '500 g', + item: item({ + baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550, + negativeTokens: ['drink', 'drinking', 'plant-based', 'vegan'], + }), + }); + expect(r.ok).toBe(true); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); + + it('accepts processed cheese slices 200g', () => { + const r = validateSearchHit({ + canonicalName: 'Processed Cheese Slices 200g', + productName: 'Kraft Processed Cheese Slices 200g', + sizeText: '200g', + item: item({ + baseUnit: 'g', minBaseQty: 180, maxBaseQty: 220, + negativeTokens: ['vegan', 'gouda', 'cheddar'], + }), + }); + expect(r.ok).toBe(true); + expect(r.score).toBeGreaterThanOrEqual(AUTO_MATCH_THRESHOLD); + }); +}); + +describe('validateSearchHit — quantity window', () => { + it('rejects 400g for a 500g target outside the allowed window', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Plain Yogurt 400g', + sizeText: 
'400g', + item: item({ baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550 }), + }); + expect(r.ok).toBe(false); + expect(r.signals.sizeWindow).toBe('fail'); + expect(r.reasons.some((s) => s.startsWith('size-window-fail'))).toBe(true); + }); + + it('rejects 2.5kg for a 1kg target', () => { + const r = validateSearchHit({ + canonicalName: 'White Sugar 1kg', + productName: 'White Sugar 2.5 kg', + sizeText: '2.5 kg', + item: item({ baseUnit: 'g', minBaseQty: 900, maxBaseQty: 1100 }), + }); + expect(r.ok).toBe(false); + expect(r.signals.sizeWindow).toBe('fail'); + }); + + it('accepts 505g for a 500g target inside the window', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Plain Yogurt 505g', + sizeText: '505g', + item: item({ baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550 }), + }); + expect(r.ok).toBe(true); + expect(r.signals.sizeWindow).toBe('pass'); + }); + + it('treats unknown size as neutral (does not hard-fail)', () => { + const r = validateSearchHit({ + canonicalName: 'Plain Yogurt 500g', + productName: 'Plain Yogurt', + sizeText: undefined, + item: item({ baseUnit: 'g', minBaseQty: 450, maxBaseQty: 550 }), + }); + expect(r.signals.sizeWindow).toBe('unknown'); + expect(r.ok).toBe(true); + }); +}); + +describe('validateSearchHit — non-food and token overlap', () => { + it('rejects seeds for a vegetable basket item', () => { + const r = validateSearchHit({ + canonicalName: 'Tomatoes Fresh 1kg', + productName: 'GGOOT Tomato Seeds 100 pcs Vegetable Garden', + sizeText: undefined, + item: item({ baseUnit: 'g' }), + }); + expect(r.ok).toBe(false); + expect(r.signals.nonFoodIndicatorHit).toBe('seeds'); + }); + + it('rejects low token overlap', () => { + const r = validateSearchHit({ + canonicalName: 'Basmati Rice 1kg', + productName: 'Olive Oil 500ml', + sizeText: '500ml', + item: item({ baseUnit: 'g' }), + }); + expect(r.ok).toBe(false); + expect(r.reasons.some((s) => s.startsWith('low-token-overlap'))).toBe(true); + }); + + 
it('returns empty-product-name reason for missing productName', () => { + const r = validateSearchHit({ + canonicalName: 'Milk 1L', + productName: undefined, + sizeText: undefined, + item: item(), + }); + expect(r.ok).toBe(false); + expect(r.reasons).toContain('empty-product-name'); + expect(r.score).toBe(0); + }); +}); diff --git a/consumer-prices-core/src/adapters/validator.ts b/consumer-prices-core/src/adapters/validator.ts new file mode 100644 index 000000000..f7b231659 --- /dev/null +++ b/consumer-prices-core/src/adapters/validator.ts @@ -0,0 +1,169 @@ +/** + * Structured search-hit validator — deterministic post-extraction gate that + * replaces the boolean `isTitlePlausible` check for scoring and candidate + * triage. Evaluates: + * 1. class-error rejects (basket item's negativeTokens present in title) + * 2. non-food indicator rejects (shared with legacy gate) + * 3. token-overlap score (identity tokens from canonicalName vs productName) + * 4. quantity-window conformance (minBaseQty <= extractedBase <= maxBaseQty) + * + * Score is a 0..1 float combining the three positive signals so callers can + * make graduated decisions (auto vs candidate) instead of the legacy 1.0 shortcut. + * Reasons are returned so shadow mode and evidence_json can be human-readable. 
+ */ +import { parseSize } from '../normalizers/size.js'; +import type { BasketItem } from '../config/types.js'; + +export interface ValidatorInput { + canonicalName: string; + productName: string | undefined; + sizeText: string | undefined; + item: Pick<BasketItem, 'baseUnit' | 'minBaseQty' | 'maxBaseQty' | 'negativeTokens'>; +} + +export interface ValidatorResult { + ok: boolean; + score: number; + reasons: string[]; + signals: { + tokenOverlap: number; + negativeTokenHit: string | null; + nonFoodIndicatorHit: string | null; + sizeWindow: 'pass' | 'fail' | 'unknown'; + extractedBaseQty: number | null; + }; +} + +const PACKAGING_WORDS = new Set([ + 'pack', 'box', 'bag', 'container', 'bottle', 'can', 'jar', 'tin', 'set', 'kit', 'bundle', +]); + +const NON_FOOD_INDICATORS = new Set([ + 'seeds', 'seed', 'seedling', 'seedlings', 'planting', 'fertilizer', 'fertiliser', +]); + +function stem(w: string): string { + return w.replace(/ies$/, 'y').replace(/es$/, '').replace(/s$/, ''); +} + +function tokens(s: string): string[] { + return s.toLowerCase().split(/\W+/).filter(Boolean); +} + +// Compact size tokens (e.g. "1kg", "500g", "250ml", "12pk") must be stripped +// from identity tokens. The quantity-window check already handles size +// fidelity. Carrying them here creates systematic false misses because +// Firecrawl usually emits size spaced ("1 kg"), which tokenises to +// ["1","kg"] — both below the length>2 floor — so the "1kg" token can +// never match. For short canonical names like "Onions 1kg" that drops +// overlap from 1.0 to 0.5 and pushes valid hits below AUTO_MATCH_THRESHOLD.
+const SIZE_LIKE = /^\d+(?:\.\d+)?[a-z]+$/; + +function identityTokens(canonicalName: string): string[] { + return tokens(canonicalName).filter( + (w) => w.length > 2 && !PACKAGING_WORDS.has(w) && !SIZE_LIKE.test(w), + ); +} + +function computeTokenOverlap(canonicalName: string, productName: string): number { + const ids = identityTokens(canonicalName); + if (ids.length === 0) return 1; + const haystack = productName.toLowerCase(); + const hits = ids.filter((w) => { + if (haystack.includes(w)) return true; + const s = stem(w); + return s.length >= 4 && s !== w && haystack.includes(s); + }); + return hits.length / ids.length; +} + +function findNegativeToken(productName: string, negativeTokens: readonly string[] | undefined): string | null { + if (!negativeTokens || negativeTokens.length === 0) return null; + const titleTokens = new Set(tokens(productName)); + const lowered = productName.toLowerCase(); + for (const raw of negativeTokens) { + const t = raw.toLowerCase().trim(); + if (!t) continue; + // Multi-word entries (e.g. "plant-based") are substring-matched; single + // words use whole-token match so "pastelaria" never matches "past". 
+ if (t.includes(' ') || t.includes('-')) { + if (lowered.includes(t)) return raw; + } else if (titleTokens.has(t)) { + return raw; + } + } + return null; +} + +function findNonFoodIndicator(productName: string): string | null { + for (const w of tokens(productName)) { + if (NON_FOOD_INDICATORS.has(w)) return w; + } + return null; +} + +function evaluateSizeWindow( + sizeText: string | undefined, + item: ValidatorInput['item'], +): { status: 'pass' | 'fail' | 'unknown'; baseQty: number | null } { + if (item.minBaseQty == null && item.maxBaseQty == null) return { status: 'unknown', baseQty: null }; + if (!sizeText) return { status: 'unknown', baseQty: null }; + const parsed = parseSize(sizeText); + if (!parsed) return { status: 'unknown', baseQty: null }; + if (parsed.baseUnit !== item.baseUnit) return { status: 'unknown', baseQty: parsed.baseQuantity }; + const min = item.minBaseQty ?? 0; + const max = item.maxBaseQty ?? Number.POSITIVE_INFINITY; + const q = parsed.baseQuantity; + return { status: q >= min && q <= max ? 
'pass' : 'fail', baseQty: q }; +} + +export function validateSearchHit(input: ValidatorInput): ValidatorResult { + const reasons: string[] = []; + const signals: ValidatorResult['signals'] = { + tokenOverlap: 0, + negativeTokenHit: null, + nonFoodIndicatorHit: null, + sizeWindow: 'unknown', + extractedBaseQty: null, + }; + + if (!input.productName) { + reasons.push('empty-product-name'); + return { ok: false, score: 0, reasons, signals }; + } + + const nonFood = findNonFoodIndicator(input.productName); + signals.nonFoodIndicatorHit = nonFood; + if (nonFood) reasons.push(`non-food-indicator:${nonFood}`); + + const negHit = findNegativeToken(input.productName, input.item.negativeTokens); + signals.negativeTokenHit = negHit; + if (negHit) reasons.push(`negative-token:${negHit}`); + + const overlap = computeTokenOverlap(input.canonicalName, input.productName); + signals.tokenOverlap = overlap; + const overlapFloor = 0.4; + if (overlap < overlapFloor) reasons.push(`low-token-overlap:${overlap.toFixed(2)}`); + + const sizeEval = evaluateSizeWindow(input.sizeText, input.item); + signals.sizeWindow = sizeEval.status; + signals.extractedBaseQty = sizeEval.baseQty; + if (sizeEval.status === 'fail') { + reasons.push(`size-window-fail:${sizeEval.baseQty}${input.item.baseUnit ?? ''}`); + } + + // Hard-reject conditions (any single one fails the hit): + const hardFail = Boolean(nonFood) || Boolean(negHit) || overlap < overlapFloor || sizeEval.status === 'fail'; + + // Score combines positive signals even when hard-failing, so candidate rows + // retain their relative quality for later review. + // Weights: token overlap 0.55, size 0.35 (or 0.2 neutral when unknown), class-clean 0.10. + const sizeComponent = sizeEval.status === 'pass' ? 0.35 : sizeEval.status === 'unknown' ? 0.2 : 0; + const classClean = nonFood || negHit ? 
0 : 0.1; + const score = Math.min(1, Math.max(0, overlap * 0.55 + sizeComponent + classClean)); + + return { ok: !hardFail, score, reasons, signals }; +} + +/** Exported for tests + metrics bucketing. */ +export const AUTO_MATCH_THRESHOLD = 0.75; diff --git a/consumer-prices-core/src/config/types.ts b/consumer-prices-core/src/config/types.ts index 655b73072..0f6f0c2a9 100644 --- a/consumer-prices-core/src/config/types.ts +++ b/consumer-prices-core/src/config/types.ts @@ -100,6 +100,11 @@ export const BasketItemSchema = z.object({ substitutionGroup: z.string().optional(), minBaseQty: z.number().optional(), maxBaseQty: z.number().optional(), + // Lowercase tokens that, if present in an extracted productName, mark the hit + // as a class mismatch (e.g. "canned" for fresh tomatoes). Intended for obvious + // class errors; product-taxonomy distinctions like plain vs greek yogurt + // belong in separate substitutionGroup values, not here. + negativeTokens: z.array(z.string()).optional(), qualificationRules: z.record(z.string(), z.unknown()).optional(), }); diff --git a/consumer-prices-core/src/db/queries/matches.ts b/consumer-prices-core/src/db/queries/matches.ts index d651e9c46..6bc2688cf 100644 --- a/consumer-prices-core/src/db/queries/matches.ts +++ b/consumer-prices-core/src/db/queries/matches.ts @@ -5,24 +5,50 @@ export async function upsertProductMatch(input: { canonicalProductId: string; basketItemId: string; matchScore: number; - matchStatus: 'auto' | 'approved'; + matchStatus: 'auto' | 'approved' | 'candidate'; + evidence?: Record<string, unknown>; }): Promise<void> { await query( `INSERT INTO product_matches (retailer_product_id, canonical_product_id, basket_item_id, match_score, match_status, evidence_json) - VALUES ($1,$2,$3,$4,$5,'{}') + VALUES ($1,$2,$3,$4,$5,$6) ON CONFLICT (retailer_product_id, canonical_product_id) DO UPDATE SET basket_item_id = EXCLUDED.basket_item_id, match_score = EXCLUDED.match_score, - match_status = EXCLUDED.match_status, - pin_disabled_at = NULL`, + --
Curated states are immutable via the scrape upsert: + -- 'approved' — human accepted the match + -- 'review' — validate-job quarantined on price outlier, or + -- human sent it back for review (see jobs/validate.ts) + -- 'rejected' — human explicitly blocked this URL + -- Conflict key is (retailer_product_id, canonical_product_id), so + -- rediscovery is the normal path for these rows. Without this + -- guard a re-scrape writes 'auto' or 'candidate' and silently + -- re-enables a previously quarantined URL in aggregate queries + -- (aggregate.ts / snapshots filter on ('auto','approved')). + -- Only machine-written states ('auto', 'candidate') are allowed + -- to move to the fresh validator verdict. + match_status = CASE + WHEN product_matches.match_status IN ('approved', 'review', 'rejected') + THEN product_matches.match_status + ELSE EXCLUDED.match_status + END, + evidence_json = EXCLUDED.evidence_json, + -- Only clear pin_disabled_at when the row is actually moving back + -- to a machine-writable state. A 'review'/'rejected' row keeps + -- its disabled flag until the review workflow resolves it. + pin_disabled_at = CASE + WHEN product_matches.match_status IN ('review', 'rejected') + THEN product_matches.pin_disabled_at + ELSE NULL + END`, [ input.retailerProductId, input.canonicalProductId, input.basketItemId, input.matchScore, input.matchStatus, + JSON.stringify(input.evidence ?? {}), ], ); // Reset stale counters when Exa re-discovers a product — fresh match means the URL works. 
diff --git a/consumer-prices-core/src/jobs/scrape.ts b/consumer-prices-core/src/jobs/scrape.ts index 3ff72dae6..95be38564 100644 --- a/consumer-prices-core/src/jobs/scrape.ts +++ b/consumer-prices-core/src/jobs/scrape.ts @@ -16,6 +16,7 @@ import { FirecrawlProvider } from '../acquisition/firecrawl.js'; import type { AdapterContext } from '../adapters/types.js'; import { upsertCanonicalProduct } from '../db/queries/products.js'; import { getBasketItemId, getPinnedUrlsForRetailer, upsertProductMatch } from '../db/queries/matches.js'; +import { AUTO_MATCH_THRESHOLD, type ValidatorResult } from '../adapters/validator.js'; const logger = { info: (msg: string, ...args: unknown[]) => console.log(`[scrape] ${msg}`, ...args), @@ -144,6 +145,29 @@ export async function scrapeRetailer(slug: string) { // so this correctly distinguishes "pin worked" from "pin failed, Exa used instead". const wasDirectHit = isDirect && product.rawPayload.direct === true; + // Direct-hit validator enforcement — the pin path's common steady + // state. The legacy isTitlePlausible gate inside _extractFromUrl + // already let this hit through, so the strict validator here acts + // as a second opinion that specifically catches pins that have + // drifted onto the wrong product (e.g. "White Sugar 1kg" now + // resolving to "mango sugar baby india"). If the validator + // disagrees, skip the observation entirely and route this target + // through the existing pin-error counter so the pin soft-disables + // after repeated failures. Aggregates never see the bad price. + if (wasDirectHit) { + const v = product.rawPayload.validator as ValidatorResult | undefined; + if (v && !v.ok) { + logger.warn( + ` [${target.id}] pin validator reject — skipping observation, counting as pin error. 
reasons=${v.reasons.join(',')} score=${v.score.toFixed(2)} title="${product.rawTitle}"`, + ); + errorsCount++; + if (pinnedProductId && pinnedMatchId) { + await handlePinError(pinnedProductId, pinnedMatchId, target.id); + } + continue; + } + } + const productId = await upsertRetailerProduct({ retailerId, retailerSku: product.retailerSku, @@ -220,12 +244,31 @@ export async function scrapeRetailer(slug: string) { product.rawPayload.canonicalName as string, ); if (basketItemId) { + // Use the validator result threaded through the adapter payload + // to pick the match state. No validator = legacy fallback at + // score 1.0 / auto (keeps the pre-validator adapters working + // unchanged). The strict path scores real hits and downgrades + // weak ones to 'candidate' so they never enter aggregates. + const validator = product.rawPayload.validator as ValidatorResult | undefined; + const hasValidator = validator != null; + const score = hasValidator ? validator.score : 1.0; + const status: 'auto' | 'candidate' = + !hasValidator || (validator.ok && score >= AUTO_MATCH_THRESHOLD) ? 'auto' : 'candidate'; + const evidence = hasValidator + ? 
{ validator: { reasons: validator.reasons, signals: validator.signals } } + : {}; + if (status === 'candidate') { + logger.warn( + ` [${target.id}] downgraded to candidate score=${score.toFixed(2)} reasons=${validator?.reasons.join(',')}`, + ); + } await upsertProductMatch({ retailerProductId: productId, canonicalProductId: canonicalId, basketItemId, - matchScore: 1.0, - matchStatus: 'auto', + matchScore: score, + matchStatus: status, + evidence, }); } } catch (matchErr) { diff --git a/consumer-prices-core/vitest.config.ts b/consumer-prices-core/vitest.config.ts new file mode 100644 index 000000000..f60d8d5d9 --- /dev/null +++ b/consumer-prices-core/vitest.config.ts @@ -0,0 +1,8 @@ +import { defineConfig } from 'vitest/config'; + +export default defineConfig({ + test: { + include: ['src/**/*.test.ts'], + exclude: ['**/node_modules/**', '**/dist/**', 'src/adapters/search.smoke.ts'], + }, +});