mirror of
https://github.com/koala73/worldmonitor.git
synced 2026-04-25 17:14:57 +02:00
* fix(brief): unblock whyMatters analyst endpoint + add DIGEST_ONLY_USER filter Three changes, all operational for PR #3248's brief-why-matters feature. 1. middleware.ts PUBLIC_API_PATHS allowlist Railway logs post-#3248 merge showed every cron call to /api/internal/brief-why-matters returning 403 — middleware's "short UA" guard (~L183) rejects Node undici's default UA before the endpoint's own Bearer-auth runs. The feature never executed in prod; three-layer fallback silently shipped legacy Gemini output. Same class as /api/seed-contract-probe (2026-04-15). Endpoint still carries its own subtle-crypto HMAC auth, so bypassing the UA gate is safe. 2. Explicit UA on callAnalystWhyMatters fetch Defense-in-depth. Explicit 'worldmonitor-digest-notifications/1.0' keeps the endpoint reachable if PUBLIC_API_PATHS is ever refactored, and makes cron traffic distinguishable from ops curl in logs. 3. DIGEST_ONLY_USER=user_xxx filter Operator single-user test flag. Set on Railway to run compose + send for one user on the next tick (then unset) — validates new features end-to-end without fanning out. Empty/unset = normal fan-out. Applied right after rule fetch so both compose and dispatch paths respect it. Regression tests: 15 new cases in tests/middleware-bot-gate.test.mts pin every PUBLIC_API_PATHS entry against 3 triggers (empty/short/curl UA) plus a negative sibling-path suite so a future prefix-match refactor can't silently unblock /api/internal/. Tests: 6043 pass. typecheck + typecheck:api clean. biome: pre-existing main() complexity warning bumped 74→78 by the filter block (unchanged in character from pre-PR). * test(middleware): expand sibling-path negatives to cover all 3 trigger UAs Greptile flagged: `SIBLING_PATHS` was only tested with `EMPTY_UA`. Under the current middleware chain this is sufficient (sibling paths hit the short-UA OR BOT_UA 403 regardless), but it doesn't pin *which* guard fires. A future refactor that moves `PUBLIC_API_PATHS.has(path)` later in the chain could let a curl or undici UA pass on a sibling path without this suite failing. Fix: iterate the 3 sibling paths against all 3 trigger UAs (empty, short/undici, curl). Every combination must still 403 regardless of which guard catches it. 6 new test cases. Tests: 35 pass in the middleware-bot-gate suite (was 29).
198 lines
8.2 KiB
TypeScript
198 lines
8.2 KiB
TypeScript
// Regression tests for middleware.ts's bot-UA gate.
|
|
//
|
|
// Pins the contract around the `/api/brief/carousel/` carve-out
|
|
// shipped in PR #3196: social-platform image fetchers
|
|
// (Slack/Telegram/Discord/LinkedIn/etc.) must be able to download
|
|
// the carousel PNGs even though their UAs contain "bot" and thus
|
|
// match BOT_UA, while the generic bot gate must still 403 plain
|
|
// scrapers on every other API path.
|
|
//
|
|
// Without this test the allowlist is the kind of policy that
|
|
// silently regresses on future middleware edits — Telegram's
|
|
// sendMediaGroup failure mode ("WEBPAGE_CURL_FAILED") does not
|
|
// surface as a CI failure anywhere else.
|
|
|
|
import { describe, it } from 'node:test';
|
|
import assert from 'node:assert/strict';
|
|
|
|
import middleware from '../middleware';
|
|
|
|
const TELEGRAM_BOT_UA = 'TelegramBot (like TwitterBot)';
|
|
const SLACKBOT_UA = 'Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)';
|
|
const DISCORDBOT_UA = 'Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)';
|
|
const LINKEDINBOT_UA = 'LinkedInBot/1.0 (compatible; Mozilla/5.0; Apache-HttpClient +http://www.linkedin.com)';
|
|
const GENERIC_CURL_UA = 'curl/8.1.2';
|
|
const GENERIC_SCRAPER_UA = 'Mozilla/5.0 (compatible; SomeRandomBot/1.2)';
|
|
const CHROME_UA =
|
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 ' +
|
|
'(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
|
|
|
|
// Slot format: YYYY-MM-DD-HHMM — per compose run, matches the
|
|
// carousel route's ISSUE_DATE_RE and the signer's slot regex.
|
|
const CAROUSEL_PATH = '/api/brief/carousel/user_abc/2026-04-19-0800/0';
|
|
// Bare YYYY-MM-DD (the pre-slot shape) must no longer match, so digest
|
|
// links that predate the slot rollout naturally fall into the bot gate
|
|
// instead of silently leaking the allowlist.
|
|
const LEGACY_DATE_ONLY_CAROUSEL_PATH = '/api/brief/carousel/user_abc/2026-04-19/0';
|
|
const OTHER_API_PATH = '/api/notifications';
|
|
const MALFORMED_CAROUSEL_PATH = '/api/brief/carousel/admin/dashboard';
|
|
|
|
function call(pathOrUrl: string, ua: string): Response | void {
|
|
const url = pathOrUrl.startsWith('http')
|
|
? pathOrUrl
|
|
: `https://www.worldmonitor.app${pathOrUrl}`;
|
|
const req = new Request(url, {
|
|
headers: ua ? { 'user-agent': ua } : {},
|
|
});
|
|
return middleware(req) as Response | void;
|
|
}
|
|
|
|
describe('middleware bot gate / carousel allowlist', () => {
|
|
it('passes TelegramBot through on the carousel route (the PR #3196 fix)', () => {
|
|
const res = call(CAROUSEL_PATH, TELEGRAM_BOT_UA);
|
|
assert.equal(res, undefined, 'Telegram must be able to fetch carousel images');
|
|
});
|
|
|
|
it('passes Slackbot through on the carousel route', () => {
|
|
const res = call(CAROUSEL_PATH, SLACKBOT_UA);
|
|
assert.equal(res, undefined);
|
|
});
|
|
|
|
it('passes Discordbot through on the carousel route', () => {
|
|
const res = call(CAROUSEL_PATH, DISCORDBOT_UA);
|
|
assert.equal(res, undefined);
|
|
});
|
|
|
|
it('passes LinkedInBot through on the carousel route', () => {
|
|
const res = call(CAROUSEL_PATH, LINKEDINBOT_UA);
|
|
assert.equal(res, undefined);
|
|
});
|
|
|
|
it('still 403s curl on the carousel route (bot gate protects from non-social UAs)', () => {
|
|
const res = call(CAROUSEL_PATH, GENERIC_CURL_UA);
|
|
assert.ok(res instanceof Response, 'should return a Response, not pass through');
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('still 403s a generic "bot" UA on the carousel route', () => {
|
|
const res = call(CAROUSEL_PATH, GENERIC_SCRAPER_UA);
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('still 403s TelegramBot on non-carousel API routes (allowlist is scoped, not global)', () => {
|
|
const res = call(OTHER_API_PATH, TELEGRAM_BOT_UA);
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('still 403s TelegramBot on malformed carousel paths (regex enforces route shape)', () => {
|
|
const res = call(MALFORMED_CAROUSEL_PATH, TELEGRAM_BOT_UA);
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('still 403s missing UA on the carousel route (short-UA guard)', () => {
|
|
const res = call(CAROUSEL_PATH, '');
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('passes normal browsers through on the carousel route', () => {
|
|
const res = call(CAROUSEL_PATH, CHROME_UA);
|
|
assert.equal(res, undefined);
|
|
});
|
|
|
|
it('passes normal browsers through on any API route', () => {
|
|
const res = call(OTHER_API_PATH, CHROME_UA);
|
|
assert.equal(res, undefined);
|
|
});
|
|
|
|
it('does not accept page 3+ on the carousel route (pageFromIndex only has 0/1/2)', () => {
|
|
const res = call('/api/brief/carousel/user_abc/2026-04-19-0800/3', TELEGRAM_BOT_UA);
|
|
assert.ok(res instanceof Response, 'out-of-range page must hit the bot gate');
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('does not accept non-slot segments on the carousel route', () => {
|
|
const res = call('/api/brief/carousel/user_abc/today/0', TELEGRAM_BOT_UA);
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
|
|
it('does not accept the pre-slot YYYY-MM-DD shape (slot rollout parity)', () => {
|
|
// Once the composer moves to slot URLs, legacy date-only paths
|
|
// should NOT leak the social allowlist — they correspond to
|
|
// expired pre-rollout links whose Redis keys no longer exist.
|
|
const res = call(LEGACY_DATE_ONLY_CAROUSEL_PATH, TELEGRAM_BOT_UA);
|
|
assert.ok(res instanceof Response);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
});
|
|
|
|
// ── PUBLIC_API_PATHS allowlist (secret-authed internal endpoints) ────────────
|
|
// The middleware's "no UA or suspiciously short" 403 guard (middleware.ts:
|
|
// ~L183) blocks Node/undici default-UA callers. Internal endpoints that carry
|
|
// their own Bearer-auth must be in PUBLIC_API_PATHS to bypass the gate.
|
|
//
|
|
// History:
|
|
// - /api/seed-contract-probe hit this 2026-04-15 (UptimeRobot + ops curl).
|
|
// - /api/internal/brief-why-matters hit this 2026-04-21 immediately after
|
|
// PR #3248 merge — every Railway cron call returned 403 and silently
|
|
// fell back to legacy Gemini. No functional breakage (3-layer fallback
|
|
// absorbed it) but the new feature never ran in prod.
|
|
//
|
|
// These tests pin the allowlist so a future middleware refactor (e.g. the
|
|
// BOT_UA regex being narrowed, or PUBLIC_API_PATHS being reorganized) can't
|
|
// silently drop an entry.
|
|
|
|
describe('middleware PUBLIC_API_PATHS — secret-authed internal endpoints bypass UA gate', () => {
|
|
// UAs that would normally 403 on any other API route.
|
|
const EMPTY_UA = '';
|
|
const UNDICI_UA = 'undici'; // Too short (<10 chars) — triggers short-UA 403.
|
|
const CURL_UA = GENERIC_CURL_UA; // Matches curl/ in BOT_UA regex.
|
|
|
|
const TRIGGERS = [
|
|
{ label: 'empty UA (middleware short-UA gate)', ua: EMPTY_UA },
|
|
{ label: 'short UA (Node undici default-ish)', ua: UNDICI_UA },
|
|
{ label: 'curl UA (BOT_UA regex hit)', ua: CURL_UA },
|
|
];
|
|
|
|
const ALLOWED_PATHS = [
|
|
'/api/version',
|
|
'/api/health',
|
|
'/api/seed-contract-probe',
|
|
'/api/internal/brief-why-matters',
|
|
];
|
|
|
|
for (const path of ALLOWED_PATHS) {
|
|
for (const { label, ua } of TRIGGERS) {
|
|
it(`${path} bypasses the UA gate (${label})`, () => {
|
|
const res = call(path, ua);
|
|
assert.equal(res, undefined, `${path} must pass through the middleware (no 403); its own auth gate handles access`);
|
|
});
|
|
}
|
|
}
|
|
|
|
// Negative case: a sibling path that is NOT in the allowlist must still 403
|
|
// under EACH of the 3 triggers. This catches a future refactor that moves
|
|
// the PUBLIC_API_PATHS check later in the chain (e.g. behind a broadened
|
|
// prefix-match) and might let one of the trigger UAs slip through on a
|
|
// sibling path without this suite failing. Pin all three guard paths.
|
|
const SIBLING_PATHS = [
|
|
'/api/internal/brief-why-matters-v2', // near-miss suffix
|
|
'/api/internal/', // directory only
|
|
'/api/internal/other', // different leaf
|
|
];
|
|
|
|
for (const path of SIBLING_PATHS) {
|
|
for (const { label, ua } of TRIGGERS) {
|
|
it(`${path} does NOT bypass the UA gate — ${label}`, () => {
|
|
const res = call(path, ua);
|
|
assert.ok(res instanceof Response, `${path} must still hit the 403 guard under ${label}`);
|
|
assert.equal(res.status, 403);
|
|
});
|
|
}
|
|
}
|
|
});
|