mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
* fix(brief): per-run slot URL so same-day digests link to distinct briefs
Digest emails at 8am and 1pm on the same day pointed to byte-identical
magazine URLs because the URL was keyed on YYYY-MM-DD in the user tz.
Each compose run overwrote the single daily envelope in place, and the
composer's rolling 24h story window meant afternoon output often looked
identical to morning. Readers clicking an older email got whatever the
latest cron happened to write.
Slot format is now YYYY-MM-DD-HHMM (local tz, per compose run). The
magazine URL, carousel URLs, and Redis key all carry the slot, and each
digest dispatch gets its own frozen envelope that lives out the 7d TTL.
envelope.data.date stays YYYY-MM-DD for rendering "19 April 2026".
The digest cron also writes a brief:latest:{userId} pointer (7d TTL,
overwritten each compose) so the dashboard panel and share-url endpoint
can locate the most recent brief without knowing the slot. The
previous date-probing strategy does not work once keys carry HHMM.
No back-compat for the old YYYY-MM-DD format: the verifier rejects it,
the composer only ever writes the new shape, and any in-flight
notifications signed under the old format will 403 on click. Acceptable
at the rollout boundary per product decision.
* fix(brief): carve middleware bot allowlist to accept slot-format carousel path
BRIEF_CAROUSEL_PATH_RE in middleware.ts was still matching only the
pre-slot YYYY-MM-DD segment, so every slot-based carousel URL emitted
by the digest cron (YYYY-MM-DD-HHMM) would miss the social allowlist
and fall into the generic bot gate. Telegram/Slack/Discord/LinkedIn
image fetchers would 403 on sendMediaGroup, breaking previews for the
new digest links.
CI missed this because tests/middleware-bot-gate.test.mts still
exercised the old /YYYY-MM-DD/ path shape. Swap the fixture to the
slot format and add a regression asserting the pre-slot shape is now
rejected, so legacy links cannot silently leak the allowlist after
the rollout.
* fix(brief): preserve caller-requested slot + correct no-brief share-url error
Two contract bugs in the slot rollout that silently misled callers:
1. GET /api/latest-brief?slot=X where X has no envelope was returning
{ status: 'composing', issueDate: <today UTC> } — which reads as
"today's brief is composing" instead of "the specific slot you
asked about doesn't exist". A caller probing a known historical
slot would get a completely unrelated "today" signal. Now we echo
the requested slot back (issueSlot + issueDate derived from its
date portion) when the caller supplied ?slot=, and keep the
UTC-today placeholder only for the no-param path.
2. POST /api/brief/share-url with no slot and no latest-pointer was
falling into the generic invalid_slot_shape 400 branch. That is
not an input-shape problem; it is "no brief exists yet for this
user". Return 404 brief_not_found — the same code the
existing-envelope check returns — so callers get one coherent
contract: either the brief exists and is shareable, or it doesn't
and you get 404.
193 lines
7.8 KiB
TypeScript
193 lines
7.8 KiB
TypeScript
// Case-insensitive user-agent substrings treated as automated clients:
// crawlers, scraping libraries and command-line HTTP tools. The broad `bot`
// alternative also matches social preview agents such as `twitterbot`, so
// any allowlist for those agents has to be evaluated before this pattern.
const BOT_UA =
  /bot|crawl|spider|slurp|archiver|wget|curl\/|python-requests|scrapy|httpclient|go-http|java\/|libwww|perl|ruby|php\/|ahrefsbot|semrushbot|mj12bot|dotbot|baiduspider|yandexbot|sogou|bytespider|petalbot|gptbot|claudebot|ccbot/i;

// Link-preview crawlers (case-insensitive substring match). The middleware
// serves these agents the variant OG page on `/` and lets them through on
// the exact routes listed in SOCIAL_PREVIEW_PATHS.
const SOCIAL_PREVIEW_UA =
  /twitterbot|facebookexternalhit|linkedinbot|slackbot|telegrambot|whatsapp|discordbot|redditbot/i;

// Exact pathnames (no trailing slash, no prefix matching) on which
// SOCIAL_PREVIEW_UA agents skip the bot gate.
const SOCIAL_PREVIEW_PATHS = new Set(['/api/story', '/api/og-story']);

// Paths that bypass bot/script UA filtering below. Each must carry its own
// auth (API key, shared secret, or intentionally-public semantics) because
// this list disables the middleware's generic bot gate.
//   - /api/version, /api/health: intentionally public, monitoring-friendly.
//   - /api/seed-contract-probe: requires RELAY_SHARED_SECRET header; called by
//     UptimeRobot + ops curl. Was blocked by the curl/bot UA regex before this
//     exception landed (Vercel log 2026-04-15: "Middleware 403 Forbidden" on
//     /api/seed-contract-probe).
// Matching is by exact pathname, so a trailing slash or sub-path does not
// inherit the bypass.
const PUBLIC_API_PATHS = new Set(['/api/version', '/api/health', '/api/seed-contract-probe']);

// Agents allowed to fetch image assets (favicons, `.png` files, brief
// carousel pages). Same preview bots as SOCIAL_PREVIEW_UA, plus
// `Slack-ImgProxy`, which that pattern does not list. Case-insensitive.
const SOCIAL_IMAGE_UA =
  /Slack-ImgProxy|Slackbot|twitterbot|facebookexternalhit|linkedinbot|telegrambot|whatsapp|discordbot|redditbot/i;

// Must match the exact route shape enforced by
// api/brief/carousel/[userId]/[issueDate]/[page].ts:
//   /api/brief/carousel/<userId>/YYYY-MM-DD-HHMM/<0|1|2>
// The issueDate segment is a per-run slot (date + HHMM in the user's
// tz) so same-day digests produce distinct carousel URLs.
// pageFromIndex() in brief-carousel-render.ts accepts only 0/1/2, so
// the trailing segment is tightly bounded.
// The pattern is anchored at both ends and tolerates one optional trailing
// slash; the pre-slot YYYY-MM-DD shape deliberately does not match.
const BRIEF_CAROUSEL_PATH_RE =
  /^\/api\/brief\/carousel\/[^/]+\/\d{4}-\d{2}-\d{2}-\d{4}\/[0-2]\/?$/;

// Maps a variant subdomain (lowercase, no port) to its variant key.
// The variant key is what VARIANT_OG is indexed by.
const VARIANT_HOST_MAP: Record<string, string> = {
  'tech.worldmonitor.app': 'tech',
  'finance.worldmonitor.app': 'finance',
  'commodity.worldmonitor.app': 'commodity',
  'happy.worldmonitor.app': 'happy',
};

// Source of truth: src/config/variant-meta.ts — keep in sync when variant metadata changes.
// Open Graph / Twitter card fields per variant key; the middleware
// interpolates these values verbatim into the preview HTML it serves.
const VARIANT_OG: Record<string, { title: string; description: string; image: string; url: string }> = {
  tech: {
    title: 'Tech Monitor - Real-Time AI & Tech Industry Dashboard',
    description: 'Real-time AI and tech industry dashboard tracking tech giants, AI labs, startup ecosystems, funding rounds, and tech events worldwide.',
    image: 'https://tech.worldmonitor.app/favico/tech/og-image.png',
    url: 'https://tech.worldmonitor.app/',
  },
  finance: {
    title: 'Finance Monitor - Real-Time Markets & Trading Dashboard',
    description: 'Real-time finance and trading dashboard tracking global markets, stock exchanges, central banks, commodities, forex, crypto, and economic indicators worldwide.',
    image: 'https://finance.worldmonitor.app/favico/finance/og-image.png',
    url: 'https://finance.worldmonitor.app/',
  },
  commodity: {
    title: 'Commodity Monitor - Real-Time Commodity Markets & Supply Chain Dashboard',
    description: 'Real-time commodity markets dashboard tracking mining sites, processing plants, commodity ports, supply chains, and global commodity trade flows.',
    image: 'https://commodity.worldmonitor.app/favico/commodity/og-image.png',
    url: 'https://commodity.worldmonitor.app/',
  },
  happy: {
    title: 'Happy Monitor - Good News & Global Progress',
    description: 'Curated positive news, progress data, and uplifting stories from around the world.',
    image: 'https://happy.worldmonitor.app/favico/happy/og-image.png',
    url: 'https://happy.worldmonitor.app/',
  },
};

// Production hosts: the apex domain plus every variant subdomain declared
// in VARIANT_HOST_MAP. Entries are lowercase and carry no port.
const ALLOWED_HOSTS = new Set([
  'worldmonitor.app',
  ...Object.keys(VARIANT_HOST_MAP),
]);

// Shape of a preview deployment host: `<slug>-<8+ lowercase alphanumerics>.vercel.app`.
// Case-sensitive and fully anchored.
const VERCEL_PREVIEW_RE = /^[a-z0-9-]+-[a-z0-9]{8,}\.vercel\.app$/;

// Canonicalises a Host header value for lookup: lower-cases it and strips
// one trailing `:<digits>` port suffix. Everything else (including a
// bracketed IPv6 literal) is returned unchanged.
function normalizeHost(raw: string): string {
  return raw.toLowerCase().replace(/:\d+$/, '');
}

// True when `host` is a known production host (ALLOWED_HOSTS) or has the
// preview-deployment shape (VERCEL_PREVIEW_RE). Both checks are
// case-sensitive, so pass a host already run through normalizeHost().
function isAllowedHost(host: string): boolean {
  return ALLOWED_HOSTS.has(host) || VERCEL_PREVIEW_RE.test(host);
}

// Exact key shape emitted by src/services/api-keys.ts:generateKey:
// `wm_` + 40 lowercase hex chars. Kept at module scope so the pattern is
// built once rather than on every request.
const WM_KEY_SHAPE = /^wm_[a-f0-9]{40}$/;

// JSON 403 returned to clients rejected by the bot gate.
function forbidden(): Response {
  return new Response('{"error":"Forbidden"}', {
    status: 403,
    headers: { 'Content-Type': 'application/json' },
  });
}

// Edge middleware: serves OG preview HTML to social bots on variant root
// pages and gates /api/* and /favico/* against bot / script user agents.
//
// Returns a Response to short-circuit the request, or nothing to let it
// continue. Checks run in this order (first match wins):
//   1. `/` + social preview UA on a variant host -> static OG HTML (200).
//   2. Path outside /api/ and /favico/            -> pass.
//   3. Image-like path + social image UA          -> pass.
//   4. Exact OG route + social preview UA         -> pass.
//   5. Public API path                            -> pass.
//   6. Well-shaped API key header                 -> pass.
//   7. Bot UA, or UA shorter than 10 chars        -> 403.
export default function middleware(request: Request) {
  const url = new URL(request.url);
  const ua = request.headers.get('user-agent') ?? '';
  const path = url.pathname;
  const host = normalizeHost(request.headers.get('host') ?? url.hostname);

  // Social bot OG response for variant subdomain root pages
  if (path === '/' && SOCIAL_PREVIEW_UA.test(ua)) {
    const variant = VARIANT_HOST_MAP[host];
    if (variant && isAllowedHost(host)) {
      const og = VARIANT_OG[variant];
      if (og) {
        const html = `<!DOCTYPE html><html><head>
<meta property="og:type" content="website"/>
<meta property="og:title" content="${og.title}"/>
<meta property="og:description" content="${og.description}"/>
<meta property="og:image" content="${og.image}"/>
<meta property="og:url" content="${og.url}"/>
<meta name="twitter:card" content="summary_large_image"/>
<meta name="twitter:title" content="${og.title}"/>
<meta name="twitter:description" content="${og.description}"/>
<meta name="twitter:image" content="${og.image}"/>
<title>${og.title}</title>
</head><body></body></html>`;
        return new Response(html, {
          status: 200,
          headers: {
            'Content-Type': 'text/html; charset=utf-8',
            'Cache-Control': 'no-store',
            'Vary': 'User-Agent, Host',
          },
        });
      }
    }
  }

  // Only apply bot filtering to /api/* and /favico/* paths
  if (!path.startsWith('/api/') && !path.startsWith('/favico/')) {
    return;
  }

  // Allow social preview/image bots on OG image assets.
  //
  // Image-returning API routes that don't end in `.png` also need
  // an explicit carve-out — otherwise server-side fetches from
  // Slack / Telegram / Discord / LinkedIn / WhatsApp / Facebook /
  // Twitter / Reddit all trip the BOT_UA gate below. Telegram
  // surfaces it as error 400 "WEBPAGE_CURL_FAILED" on sendMediaGroup;
  // the others silently drop the preview image.
  //
  // Only the brief carousel route shape is allowlisted — a strict
  // regex (same shape enforced by the handler) prevents a future
  // /api/brief/carousel/admin or similar sibling from accidentally
  // inheriting this bypass. HMAC token in the URL is the real auth;
  // this allowlist is defence-in-depth for any well-shaped request
  // whose UA happens to be in SOCIAL_IMAGE_UA.
  if (
    path.startsWith('/favico/') ||
    path.endsWith('.png') ||
    BRIEF_CAROUSEL_PATH_RE.test(path)
  ) {
    if (SOCIAL_IMAGE_UA.test(ua)) {
      return;
    }
  }

  // Allow social preview bots on exact OG routes only
  if (SOCIAL_PREVIEW_UA.test(ua) && SOCIAL_PREVIEW_PATHS.has(path)) {
    return;
  }

  // Public endpoints bypass all bot filtering
  if (PUBLIC_API_PATHS.has(path)) {
    return;
  }

  // Authenticated Pro API clients bypass UA filtering. This is a cheap
  // edge heuristic, not auth — real validation (SHA-256 hash vs Convex
  // userApiKeys + entitlement) happens in server/gateway.ts. To keep the
  // bot-UA shield meaningful, require the exact key shape (WM_KEY_SHAPE).
  // A random scraper would have to guess a specific 43-char format, and
  // spoofed-but-well-shaped keys still 401 at the gateway.
  const apiKey =
    request.headers.get('x-worldmonitor-key') ??
    request.headers.get('x-api-key') ??
    '';
  if (WM_KEY_SHAPE.test(apiKey)) {
    return;
  }

  // Block bots from all API routes
  if (BOT_UA.test(ua)) {
    return forbidden();
  }

  // No user-agent or suspiciously short — likely a script.
  // (A missing header was defaulted to '' above, so the length check
  // covers the empty case too.)
  if (ua.length < 10) {
    return forbidden();
  }
}

// Route matcher for this middleware: the site root (for the social-bot OG
// response) plus everything under /api/ and /favico/ (for the bot gate).
export const config = {
  matcher: ['/', '/api/:path*', '/favico/:path*'],
};