mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
* fix(brief): unblock whyMatters analyst endpoint + add DIGEST_ONLY_USER filter

  Three changes, all operational for PR #3248's brief-why-matters feature.

  1. middleware.ts PUBLIC_API_PATHS allowlist
     Railway logs post-#3248 merge showed every cron call to /api/internal/brief-why-matters returning 403 — middleware's "short UA" guard (~L183) rejects Node undici's default UA before the endpoint's own Bearer-auth runs. The feature never executed in prod; three-layer fallback silently shipped legacy Gemini output. Same class as /api/seed-contract-probe (2026-04-15). Endpoint still carries its own subtle-crypto HMAC auth, so bypassing the UA gate is safe.

  2. Explicit UA on callAnalystWhyMatters fetch
     Defense-in-depth. Explicit 'worldmonitor-digest-notifications/1.0' keeps the endpoint reachable if PUBLIC_API_PATHS is ever refactored, and makes cron traffic distinguishable from ops curl in logs.

  3. DIGEST_ONLY_USER=user_xxx filter
     Operator single-user test flag. Set on Railway to run compose + send for one user on the next tick (then unset) — validates new features end-to-end without fanning out. Empty/unset = normal fan-out. Applied right after rule fetch so both compose and dispatch paths respect it.

  Regression tests: 15 new cases in tests/middleware-bot-gate.test.mts pin every PUBLIC_API_PATHS entry against 3 triggers (empty/short/curl UA) plus a negative sibling-path suite so a future prefix-match refactor can't silently unblock /api/internal/.

  Tests: 6043 pass. typecheck + typecheck:api clean. biome: pre-existing main() complexity warning bumped 74→78 by the filter block (unchanged in character from pre-PR).

* test(middleware): expand sibling-path negatives to cover all 3 trigger UAs

  Greptile flagged: `SIBLING_PATHS` was only tested with `EMPTY_UA`. Under the current middleware chain this is sufficient (sibling paths hit the short-UA OR BOT_UA 403 regardless), but it doesn't pin *which* guard fires. A future refactor that moves `PUBLIC_API_PATHS.has(path)` later in the chain could let a curl or undici UA pass on a sibling path without this suite failing.

  Fix: iterate the 3 sibling paths against all 3 trigger UAs (empty, short/undici, curl). Every combination must still 403 regardless of which guard catches it. 6 new test cases.

  Tests: 35 pass in the middleware-bot-gate suite (was 29).
204 lines
8.3 KiB
TypeScript
204 lines
8.3 KiB
TypeScript
// User-agent fragments treated as crawlers, scrapers, or scripted HTTP clients.
// The pattern is unanchored and case-insensitive, so a fragment anywhere in the
// UA string matches. middleware() answers 403 on /api/* and /favico/* when this
// matches and none of the earlier carve-outs applied.
const BOT_UA =
  /bot|crawl|spider|slurp|archiver|wget|curl\/|python-requests|scrapy|httpclient|go-http|java\/|libwww|perl|ruby|php\/|ahrefsbot|semrushbot|mj12bot|dotbot|baiduspider|yandexbot|sogou|bytespider|petalbot|gptbot|claudebot|ccbot/i;

// Link-preview fetchers of social/chat platforms (case-insensitive, unanchored).
// middleware() uses this twice: to serve the static OG page on variant root
// URLs, and to let these fetchers through on the exact SOCIAL_PREVIEW_PATHS
// routes. Every fragment here also contains "bot"-like tokens that BOT_UA
// would reject, which is why those carve-outs run before the BOT_UA gate.
const SOCIAL_PREVIEW_UA =
  /twitterbot|facebookexternalhit|linkedinbot|slackbot|telegrambot|whatsapp|discordbot|redditbot/i;

// API routes that social preview fetchers may request. Membership is an exact
// pathname match (Set lookup), so sub-paths and trailing-slash variants are
// not covered.
const SOCIAL_PREVIEW_PATHS = new Set(['/api/story', '/api/og-story']);

// Paths that bypass bot/script UA filtering below. Each must carry its own
// auth (API key, shared secret, or intentionally-public semantics) because
// this list disables the middleware's generic bot gate.
//
// Matching is by exact pathname (Set lookup), so siblings such as
// /api/internal/<anything-else> do NOT inherit the bypass.
//
// - /api/version, /api/health: intentionally public, monitoring-friendly.
// - /api/seed-contract-probe: requires RELAY_SHARED_SECRET header; called by
//   UptimeRobot + ops curl. Was blocked by the curl/bot UA regex before this
//   exception landed (Vercel log 2026-04-15: "Middleware 403 Forbidden" on
//   /api/seed-contract-probe).
// - /api/internal/brief-why-matters: requires RELAY_SHARED_SECRET Bearer
//   (subtle-crypto HMAC timing-safe compare in server/_shared/internal-auth.ts).
//   Called from the Railway digest-notifications cron whose fetch() uses the
//   Node undici default UA, which is short enough to trip the "no UA or
//   suspiciously short" 403 below (Railway log 2026-04-21 post-#3248 merge:
//   every cron call returned 403 and silently fell back to legacy Gemini).
const PUBLIC_API_PATHS = new Set([
  '/api/version',
  '/api/health',
  '/api/seed-contract-probe',
  '/api/internal/brief-why-matters',
]);

// User agents allowed to fetch image assets (/favico/*, *.png, and the brief
// carousel route) despite looking like bots. Compared with SOCIAL_PREVIEW_UA
// this list additionally names Slack-ImgProxy. Case-insensitive, unanchored.
const SOCIAL_IMAGE_UA =
  /Slack-ImgProxy|Slackbot|twitterbot|facebookexternalhit|linkedinbot|telegrambot|whatsapp|discordbot|redditbot/i;

// Must match the exact route shape enforced by
// api/brief/carousel/[userId]/[issueDate]/[page].ts:
//   /api/brief/carousel/<userId>/YYYY-MM-DD-HHMM/<0|1|2>
// The issueDate segment is a per-run slot (date + HHMM in the user's
// tz) so same-day digests produce distinct carousel URLs.
// pageFromIndex() in brief-carousel-render.ts accepts only 0/1/2, so
// the trailing segment is tightly bounded.
// The pattern is anchored at both ends; a single optional trailing slash is
// tolerated, and <userId> may be any non-empty run of non-"/" characters.
const BRIEF_CAROUSEL_PATH_RE =
  /^\/api\/brief\/carousel\/[^/]+\/\d{4}-\d{2}-\d{2}-\d{4}\/[0-2]\/?$/;

// Maps a variant subdomain (already lower-cased, without port) to the variant
// key used to index VARIANT_OG. The keys are also spread into ALLOWED_HOSTS.
const VARIANT_HOST_MAP: Record<string, string> = {
  'tech.worldmonitor.app': 'tech',
  'finance.worldmonitor.app': 'finance',
  'commodity.worldmonitor.app': 'commodity',
  'happy.worldmonitor.app': 'happy',
};

// Source of truth: src/config/variant-meta.ts — keep in sync when variant metadata changes.
//
// Open Graph / Twitter card metadata per variant key. middleware() interpolates
// these strings into HTML attribute values and <title> WITHOUT escaping, so
// every value here must stay free of `"`, `<`, and `>`.
const VARIANT_OG: Record<string, { title: string; description: string; image: string; url: string }> = {
  tech: {
    title: 'Tech Monitor - Real-Time AI & Tech Industry Dashboard',
    description: 'Real-time AI and tech industry dashboard tracking tech giants, AI labs, startup ecosystems, funding rounds, and tech events worldwide.',
    image: 'https://tech.worldmonitor.app/favico/tech/og-image.png',
    url: 'https://tech.worldmonitor.app/',
  },
  finance: {
    title: 'Finance Monitor - Real-Time Markets & Trading Dashboard',
    description: 'Real-time finance and trading dashboard tracking global markets, stock exchanges, central banks, commodities, forex, crypto, and economic indicators worldwide.',
    image: 'https://finance.worldmonitor.app/favico/finance/og-image.png',
    url: 'https://finance.worldmonitor.app/',
  },
  commodity: {
    title: 'Commodity Monitor - Real-Time Commodity Markets & Supply Chain Dashboard',
    description: 'Real-time commodity markets dashboard tracking mining sites, processing plants, commodity ports, supply chains, and global commodity trade flows.',
    image: 'https://commodity.worldmonitor.app/favico/commodity/og-image.png',
    url: 'https://commodity.worldmonitor.app/',
  },
  happy: {
    title: 'Happy Monitor - Good News & Global Progress',
    description: 'Curated positive news, progress data, and uplifting stories from around the world.',
    image: 'https://happy.worldmonitor.app/favico/happy/og-image.png',
    url: 'https://happy.worldmonitor.app/',
  },
};

// Production hostnames accepted by isAllowedHost(): the apex domain plus every
// variant subdomain declared in VARIANT_HOST_MAP. Being a Set of own keys, it
// never matches inherited object properties such as "constructor".
const ALLOWED_HOSTS = new Set([
  'worldmonitor.app',
  ...Object.keys(VARIANT_HOST_MAP),
]);
// Hostname shape accepted as a preview deployment: `<name>-<suffix>.vercel.app`
// where <suffix> is at least 8 lowercase alphanumerics. Anchored at both ends
// and lowercase-only, so callers must pass a host already run through
// normalizeHost().
const VERCEL_PREVIEW_RE = /^[a-z0-9-]+-[a-z0-9]{8,}\.vercel\.app$/;

/**
 * Canonicalises a Host header value for lookup in the host tables.
 *
 * Lower-cases the value and strips one trailing `:<digits>` port suffix.
 * Nothing else is altered (no trimming, no trailing-dot handling).
 *
 * @param raw - Host header value, or a URL hostname used as fallback.
 * @returns The lower-cased host without a port.
 */
function normalizeHost(raw: string): string {
  return raw.toLowerCase().replace(/:\d+$/, '');
}

/**
 * Reports whether a host is one this deployment serves.
 *
 * @param host - Host already normalised by normalizeHost() (lower-case, no port).
 * @returns `true` when the host is listed in ALLOWED_HOSTS or has the
 *   preview-deployment shape described by VERCEL_PREVIEW_RE.
 */
function isAllowedHost(host: string): boolean {
  return ALLOWED_HOSTS.has(host) || VERCEL_PREVIEW_RE.test(host);
}

// Exact key shape emitted by src/services/api-keys.ts:generateKey:
// `wm_` + 40 lowercase hex chars. Declared at module scope so the pattern is
// not re-created on every request.
const WM_KEY_SHAPE = /^wm_[a-f0-9]{40}$/;

/** Builds the JSON 403 returned by both UA gates in middleware(). */
function forbiddenResponse(): Response {
  return new Response('{"error":"Forbidden"}', {
    status: 403,
    headers: { 'Content-Type': 'application/json' },
  });
}

/**
 * Edge middleware: serves static OG metadata to social preview bots on variant
 * root pages and rejects bot-like or script-like user agents on /api/* and
 * /favico/*.
 *
 * Checks run in this order, and the first one that applies decides:
 *  1. `/` + social preview UA + known variant host -> 200 HTML with OG tags.
 *  2. Path outside /api/ and /favico/              -> pass through.
 *  3. Image-like path + social image UA            -> pass through.
 *  4. Social preview UA on an exact OG route       -> pass through.
 *  5. Path listed in PUBLIC_API_PATHS              -> pass through.
 *  6. Well-shaped `wm_` API key header             -> pass through.
 *  7. UA matches BOT_UA                            -> 403.
 *  8. UA missing or shorter than 10 characters     -> 403.
 *  9. Otherwise                                    -> pass through.
 *
 * @param request - Incoming request.
 * @returns A Response to short-circuit the request, or `undefined` to let it
 *   continue to the matched route.
 */
export default function middleware(request: Request): Response | undefined {
  const url = new URL(request.url);
  const ua = request.headers.get('user-agent') ?? '';
  const path = url.pathname;
  const host = normalizeHost(request.headers.get('host') ?? url.hostname);

  // Social bot OG response for variant subdomain root pages
  if (path === '/' && SOCIAL_PREVIEW_UA.test(ua)) {
    const variant = VARIANT_HOST_MAP[host];
    // isAllowedHost() also rejects hosts that only "hit" the map through an
    // inherited object property (e.g. a Host header of "constructor").
    if (variant && isAllowedHost(host)) {
      const og = VARIANT_OG[variant];
      if (og) {
        // The template is left-aligned on purpose: its whitespace is part of
        // the response body. Values are interpolated unescaped.
        const html = `<!DOCTYPE html><html><head>
<meta property="og:type" content="website"/>
<meta property="og:title" content="${og.title}"/>
<meta property="og:description" content="${og.description}"/>
<meta property="og:image" content="${og.image}"/>
<meta property="og:url" content="${og.url}"/>
<meta name="twitter:card" content="summary_large_image"/>
<meta name="twitter:title" content="${og.title}"/>
<meta name="twitter:description" content="${og.description}"/>
<meta name="twitter:image" content="${og.image}"/>
<title>${og.title}</title>
</head><body></body></html>`;
        return new Response(html, {
          status: 200,
          headers: {
            'Content-Type': 'text/html; charset=utf-8',
            'Cache-Control': 'no-store',
            'Vary': 'User-Agent, Host',
          },
        });
      }
    }
  }

  // Only apply bot filtering to /api/* and /favico/* paths
  if (!path.startsWith('/api/') && !path.startsWith('/favico/')) {
    return undefined;
  }

  // Allow social preview/image bots on OG image assets.
  //
  // Image-returning API routes that don't end in `.png` also need
  // an explicit carve-out — otherwise server-side fetches from
  // Slack / Telegram / Discord / LinkedIn / WhatsApp / Facebook /
  // Twitter / Reddit all trip the BOT_UA gate below. Telegram
  // surfaces it as error 400 "WEBPAGE_CURL_FAILED" on sendMediaGroup;
  // the others silently drop the preview image.
  //
  // Only the brief carousel route shape is allowlisted — a strict
  // regex (same shape enforced by the handler) prevents a future
  // /api/brief/carousel/admin or similar sibling from accidentally
  // inheriting this bypass. HMAC token in the URL is the real auth;
  // this allowlist is defence-in-depth for any well-shaped request
  // whose UA happens to be in SOCIAL_IMAGE_UA.
  const isImageLikePath =
    path.startsWith('/favico/') ||
    path.endsWith('.png') ||
    BRIEF_CAROUSEL_PATH_RE.test(path);
  if (isImageLikePath && SOCIAL_IMAGE_UA.test(ua)) {
    return undefined;
  }

  // Allow social preview bots on exact OG routes only
  if (SOCIAL_PREVIEW_UA.test(ua) && SOCIAL_PREVIEW_PATHS.has(path)) {
    return undefined;
  }

  // Public endpoints bypass all bot filtering
  if (PUBLIC_API_PATHS.has(path)) {
    return undefined;
  }

  // Authenticated Pro API clients bypass UA filtering. This is a cheap
  // edge heuristic, not auth — real validation (SHA-256 hash vs Convex
  // userApiKeys + entitlement) happens in server/gateway.ts. To keep the
  // bot-UA shield meaningful, require the exact key shape emitted by
  // src/services/api-keys.ts:generateKey: `wm_` + 40 lowercase hex chars.
  // A random scraper would have to guess a specific 43-char format, and
  // spoofed-but-well-shaped keys still 401 at the gateway.
  //
  // x-api-key is consulted only when x-worldmonitor-key is absent; a present
  // but malformed x-worldmonitor-key does not fall back.
  const apiKey =
    request.headers.get('x-worldmonitor-key') ??
    request.headers.get('x-api-key') ??
    '';
  if (WM_KEY_SHAPE.test(apiKey)) {
    return undefined;
  }

  // Block bots from all API routes
  if (BOT_UA.test(ua)) {
    return forbiddenResponse();
  }

  // No user-agent or suspiciously short — likely a script.
  // A missing header was defaulted to '' above, so the length check covers it.
  if (ua.length < 10) {
    return forbiddenResponse();
  }

  return undefined;
}

// Route matcher for this middleware: the site root plus everything under
// /api/ and /favico/. middleware() only has branches for these paths.
export const config = {
  matcher: ['/', '/api/:path*', '/favico/:path*'],
};