#!/usr/bin/env npx tsx
/**
 * paperclip-commit-metrics — searches GitHub commit search for commits
 * co-authored by Paperclip, caches search windows and per-commit line stats
 * in a local JSON file, and prints (or exports) an aggregate summary.
 *
 * NOTE(review): the source had every generic type argument stripped
 * (`Record`, `Promise`, `Set`, `Map` with no `<...>`), which is the classic
 * artifact of HTML-tag stripping — `getJson<{ stats?: CommitStats }>`
 * survived because `<{` is not a valid tag. All type arguments below are
 * reconstructed from how each value is used. The `<noreply@paperclip.ing>`
 * portion of DEFAULT_QUERY and the `<value>` placeholders in the help text
 * were restored from the same evidence (PAPERCLIP_EMAIL and the co-author
 * trailer regex) — confirm against the original script.
 */
import { execFile } from "node:child_process";
import { promises as fs } from "node:fs";
import path from "node:path";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Quoted phrase matched against commit messages by GitHub commit search.
const DEFAULT_QUERY = "\"Co-Authored-By: Paperclip <noreply@paperclip.ing>\"";
const DEFAULT_CACHE_FILE = path.resolve("data/paperclip-commit-metrics-cache.json");
const DEFAULT_SEARCH_START = "2008-01-01T00:00:00Z";
// GitHub search caps results per query (1000); windows whose total_count
// exceeds this threshold are bisected until each half fits.
const SEARCH_WINDOW_LIMIT = 900;
// Stop bisecting once a window is this narrow; more results than the limit
// inside one minute means the data cannot be fully paged.
const MIN_WINDOW_MS = 60_000;
const DEFAULT_STATS_FETCH_LIMIT = 250;
const DEFAULT_STATS_CONCURRENCY = 4;
const DEFAULT_SEARCH_FIELD = "committer-date";
// Paperclip's own identity is excluded from the contributor roll-up.
const PAPERCLIP_EMAIL = "noreply@paperclip.ing";
const PAPERCLIP_NAME = "paperclip";

interface CliOptions {
  cacheFile: string;
  end: Date;
  excludeOwners: string[];
  exportFormat: "csv" | "json";
  includePrivate: boolean;
  json: boolean;
  output: string | null;
  query: string;
  refreshSearch: boolean;
  refreshStats: boolean;
  searchField: "author-date" | "committer-date";
  start: Date;
  statsConcurrency: number;
  statsFetchLimit: number;
  skipStats: boolean;
}

/** Shape of one item returned by GET /search/commits. */
interface SearchCommitItem {
  author: {
    login?: string;
  } | null;
  commit: {
    author: {
      date: string;
      email: string | null;
      name: string | null;
    } | null;
    message: string;
  };
  html_url: string;
  repository: {
    full_name: string;
    html_url: string;
  };
  sha: string;
}

interface CommitStats {
  additions: number;
  deletions: number;
  total: number;
}

/** Commit metadata persisted in the cache file, keyed by sha. */
interface CachedCommit {
  authorEmail: string | null;
  authorLogin: string | null;
  authorName: string | null;
  committedAt: string | null;
  contributors: ContributorRecord[];
  htmlUrl: string;
  repositoryFullName: string;
  repositoryUrl: string;
  sha: string;
}

interface CachedCommitStats extends CommitStats {
  fetchedAt: string;
}

interface ContributorRecord {
  displayName: string;
  email: string | null;
  // Stable dedupe key: "login:…" > "email:…" > "name:…".
  key: string;
  login: string | null;
}

/** A fully-paged search window; reused on later runs unless --refresh-search. */
interface WindowCacheEntry {
  completedAt: string;
  key: string;
  shas: string[];
  totalCount: number;
}

interface CacheFile {
  commits: Record<string, CachedCommit>;
  queryKey: string;
  searchField: CliOptions["searchField"];
  stats: Record<string, CachedCommitStats>;
  updatedAt: string | null;
  version: number;
  windows: Record<string, WindowCacheEntry>;
}

interface SearchResponse {
  incomplete_results: boolean;
  items: SearchCommitItem[];
  total_count: number;
}

interface SearchWindowResult {
  shas: Set<string>;
  totalCount: number;
}

interface Summary {
  cacheFile: string;
  contributors: {
    count: number;
    sample: ContributorRecord[];
  };
  detectedQuery: string;
  lineStats: {
    additions: number;
    complete: boolean;
    coveredCommits: number;
    deletions: number;
    missingCommits: number;
    totalChanges: number;
  };
  range: {
    end: string;
    searchField: CliOptions["searchField"];
    start: string;
  };
  filters: {
    excludedOwners: string[];
  };
  repos: {
    count: number;
    sample: string[];
  };
  statsFetch: {
    fetchedThisRun: number;
    skipped: boolean;
  };
  totals: {
    commits: number;
  };
}

/** Entry point: search, enrich stats, persist cache, then print/export. */
async function main() {
  const options = parseArgs(process.argv.slice(2));
  const cache = await loadCache(options.cacheFile, options);
  const client = new GitHubClient(await resolveGitHubToken());
  const { shas } = await searchWindow(client, cache, options, options.start, options.end);
  const sortedShas = [...shas].sort();
  let fetchedThisRun = 0;
  if (!options.skipStats) {
    fetchedThisRun = await enrichCommitStats(client, cache, options, sortedShas);
  }
  cache.updatedAt = new Date().toISOString();
  await saveCache(options.cacheFile, cache);
  const filteredShas = sortFilteredShas(cache, filterShas(cache, sortedShas, options));
  const summary = buildSummary(cache, options, filteredShas, fetchedThisRun);
  if (options.output) {
    await writeExport(options.output, options.exportFormat, cache, filteredShas, summary);
  }
  if (options.json) {
    console.log(JSON.stringify(summary, null, 2));
    return;
  }
  printSummary(summary);
}

/**
 * Parses CLI flags into CliOptions.
 *
 * @throws Error on unknown flags, missing values, malformed dates/ints,
 *         or an inverted --start/--end range.
 */
function parseArgs(argv: string[]): CliOptions {
  const options: CliOptions = {
    cacheFile: DEFAULT_CACHE_FILE,
    end: new Date(),
    excludeOwners: [],
    exportFormat: "csv",
    includePrivate: false,
    json: false,
    output: null,
    query: DEFAULT_QUERY,
    refreshSearch: false,
    refreshStats: false,
    searchField: DEFAULT_SEARCH_FIELD,
    start: new Date(DEFAULT_SEARCH_START),
    statsConcurrency: DEFAULT_STATS_CONCURRENCY,
    statsFetchLimit: DEFAULT_STATS_FETCH_LIMIT,
    skipStats: false,
  };
  for (let index = 0; index < argv.length; index += 1) {
    const arg = argv[index];
    switch (arg) {
      case "--cache-file":
        options.cacheFile = requireValue(argv, ++index, arg);
        break;
      case "--end":
        options.end = parseDateArg(requireValue(argv, ++index, arg), arg);
        break;
      case "--exclude-owner":
        // Owner comparison is case-insensitive; normalize once at parse time.
        options.excludeOwners.push(requireValue(argv, ++index, arg).toLowerCase());
        break;
      case "--export-format": {
        const value = requireValue(argv, ++index, arg);
        if (value !== "csv" && value !== "json") {
          throw new Error(`Invalid --export-format value: ${value}`);
        }
        options.exportFormat = value;
        break;
      }
      case "--include-private":
        options.includePrivate = true;
        break;
      case "--json":
        options.json = true;
        break;
      case "--output":
        options.output = requireValue(argv, ++index, arg);
        break;
      case "--query":
        options.query = requireValue(argv, ++index, arg);
        break;
      case "--refresh-search":
        options.refreshSearch = true;
        break;
      case "--refresh-stats":
        options.refreshStats = true;
        break;
      case "--search-field": {
        const value = requireValue(argv, ++index, arg);
        if (value !== "author-date" && value !== "committer-date") {
          throw new Error(`Invalid --search-field value: ${value}`);
        }
        options.searchField = value;
        break;
      }
      case "--skip-stats":
        options.skipStats = true;
        break;
      case "--start":
        options.start = parseDateArg(requireValue(argv, ++index, arg), arg);
        break;
      case "--stats-concurrency":
        options.statsConcurrency = parsePositiveInt(requireValue(argv, ++index, arg), arg);
        break;
      case "--stats-fetch-limit":
        options.statsFetchLimit = parseNonNegativeInt(requireValue(argv, ++index, arg), arg);
        break;
      case "--help":
        printHelp();
        process.exit(0);
        break;
      default:
        throw new Error(`Unknown argument: ${arg}`);
    }
  }
  if (Number.isNaN(options.start.getTime()) || Number.isNaN(options.end.getTime())) {
    throw new Error("Invalid start or end date");
  }
  if (options.start >= options.end) {
    throw new Error("--start must be earlier than --end");
  }
  return options;
}

function requireValue(argv: string[], index: number, flag: string): string {
  const value = argv[index];
  if (!value) {
    throw new Error(`Missing value for ${flag}`);
  }
  return value;
}

function parseDateArg(value: string, flag: string): Date {
  const parsed = new Date(value);
  if (Number.isNaN(parsed.getTime())) {
    throw new Error(`Invalid date for ${flag}: ${value}`);
  }
  return parsed;
}

function parsePositiveInt(value: string, flag: string): number {
  const parsed = Number.parseInt(value, 10);
  if (!Number.isFinite(parsed) || parsed <= 0) {
    throw new Error(`Invalid positive integer for ${flag}: ${value}`);
  }
  return parsed;
}

function parseNonNegativeInt(value: string, flag: string): number {
  const parsed = Number.parseInt(value, 10);
  if (!Number.isFinite(parsed) || parsed < 0) {
    throw new Error(`Invalid non-negative integer for ${flag}: ${value}`);
  }
  return parsed;
}

function printHelp() {
  console.log(`Usage: tsx scripts/paperclip-commit-metrics.ts [options]

Options:
  --start <date>             ISO date/time lower bound (default: ${DEFAULT_SEARCH_START})
  --end <date>               ISO date/time upper bound (default: now)
  --query <text>             Commit search string (default: ${DEFAULT_QUERY})
  --search-field <field>     author-date | committer-date (default: ${DEFAULT_SEARCH_FIELD})
  --include-private          Include repos visible to the current token
  --exclude-owner <owner>    Exclude repositories owned by this GitHub owner/org (repeatable)
  --cache-file <path>        Cache path (default: ${DEFAULT_CACHE_FILE})
  --skip-stats               Skip additions/deletions enrichment
  --stats-fetch-limit <n>    Max uncached commit stats to fetch this run (default: ${DEFAULT_STATS_FETCH_LIMIT})
  --stats-concurrency <n>    Parallel commit stat requests (default: ${DEFAULT_STATS_CONCURRENCY})
  --output <path>            Write the full filtered result set to a file
  --export-format <format>   csv | json for --output exports (default: csv)
  --refresh-search           Ignore cached search windows
  --refresh-stats            Re-fetch cached commit stats
  --json                     Print JSON summary
  --help                     Show this help
`);
}

/**
 * Resolves a token from GITHUB_TOKEN/GH_TOKEN, falling back to `gh auth token`.
 *
 * @throws Error when no token can be resolved (or `gh` itself fails).
 */
async function resolveGitHubToken(): Promise<string> {
  const envToken = process.env.GITHUB_TOKEN ?? process.env.GH_TOKEN;
  if (envToken) {
    return envToken;
  }
  const { stdout } = await execFileAsync("gh", ["auth", "token"]);
  const token = stdout.trim();
  if (!token) {
    throw new Error("Unable to resolve a GitHub token. Set GITHUB_TOKEN/GH_TOKEN or run `gh auth login`.");
  }
  return token;
}

/**
 * Loads the on-disk cache, discarding it when the version, query key, or
 * search field no longer match the current options.
 */
async function loadCache(cacheFile: string, options: CliOptions): Promise<CacheFile> {
  try {
    const raw = await fs.readFile(cacheFile, "utf8");
    // NOTE(review): trusted local file — the JSON.parse result is not
    // schema-validated beyond the version/queryKey/searchField check.
    const parsed = JSON.parse(raw) as CacheFile;
    if (parsed.version !== 1 || parsed.queryKey !== buildQueryKey(options) || parsed.searchField !== options.searchField) {
      return createEmptyCache(options);
    }
    return parsed;
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
      return createEmptyCache(options);
    }
    throw error;
  }
}

function createEmptyCache(options: CliOptions): CacheFile {
  return {
    commits: {},
    queryKey: buildQueryKey(options),
    searchField: options.searchField,
    stats: {},
    updatedAt: null,
    version: 1,
    windows: {},
  };
}

/** Cache identity: any change to query, search field, or visibility invalidates it. */
function buildQueryKey(options: CliOptions): string {
  const visibility = options.includePrivate ? "all" : "public";
  return JSON.stringify({
    query: options.query,
    searchField: options.searchField,
    visibility,
  });
}

async function saveCache(cacheFile: string, cache: CacheFile): Promise<void> {
  await fs.mkdir(path.dirname(cacheFile), { recursive: true });
  await fs.writeFile(cacheFile, JSON.stringify(cache, null, 2), "utf8");
}

/**
 * Collects all matching commit shas in [start, end], recursively bisecting
 * the time window when GitHub reports more hits than can be paged reliably.
 * Fully-paged windows are memoized in the cache.
 */
async function searchWindow(
  client: GitHubClient,
  cache: CacheFile,
  options: CliOptions,
  start: Date,
  end: Date,
): Promise<SearchWindowResult> {
  const windowKey = makeWindowKey(start, end);
  if (!options.refreshSearch) {
    const cached = cache.windows[windowKey];
    if (cached) {
      return { shas: new Set(cached.shas), totalCount: cached.totalCount };
    }
  }
  const firstPage = await searchPage(client, options, start, end, 1, 100);
  if (firstPage.incomplete_results) {
    throw new Error(`GitHub returned incomplete search results for window ${windowKey}`);
  }
  if (firstPage.total_count > SEARCH_WINDOW_LIMIT) {
    const durationMs = end.getTime() - start.getTime();
    if (durationMs <= MIN_WINDOW_MS) {
      throw new Error(
        `Search window ${windowKey} still has ${firstPage.total_count} results after splitting to ${durationMs}ms.`,
      );
    }
    const midpoint = new Date(start.getTime() + Math.floor(durationMs / 2));
    const left = await searchWindow(client, cache, options, start, midpoint);
    // +1ms keeps the halves disjoint; both bounds are inclusive in the query.
    const right = await searchWindow(client, cache, options, new Date(midpoint.getTime() + 1), end);
    const shas = new Set([...left.shas, ...right.shas]);
    cache.windows[windowKey] = {
      completedAt: new Date().toISOString(),
      key: windowKey,
      shas: [...shas],
      totalCount: shas.size,
    };
    return { shas, totalCount: shas.size };
  }
  const pageCount = Math.ceil(firstPage.total_count / 100);
  const shas = new Set<string>();
  ingestSearchItems(cache, firstPage.items, shas);
  for (let page = 2; page <= pageCount; page += 1) {
    const response = await searchPage(client, options, start, end, page, 100);
    if (response.incomplete_results) {
      throw new Error(`GitHub returned incomplete search results for window ${windowKey} on page ${page}`);
    }
    ingestSearchItems(cache, response.items, shas);
  }
  cache.windows[windowKey] = {
    completedAt: new Date().toISOString(),
    key: windowKey,
    shas: [...shas],
    totalCount: firstPage.total_count,
  };
  return { shas, totalCount: firstPage.total_count };
}

/** Fetches a single page of GET /search/commits for the given window. */
async function searchPage(
  client: GitHubClient,
  options: CliOptions,
  start: Date,
  end: Date,
  page: number,
  perPage: number,
): Promise<SearchResponse> {
  const searchQuery = buildSearchQuery(options, start, end);
  const params = new URLSearchParams({
    page: String(page),
    per_page: String(perPage),
    q: searchQuery,
  });
  return client.getJson<SearchResponse>(`/search/commits?${params.toString()}`);
}

function buildSearchQuery(options: CliOptions, start: Date, end: Date): string {
  const qualifiers = [`${options.searchField}:${formatQueryDate(start)}..${formatQueryDate(end)}`];
  if (!options.includePrivate) {
    qualifiers.push("is:public");
  }
  return `${options.query} ${qualifiers.join(" ")}`.trim();
}

/** Drops shas whose repository owner is in --exclude-owner (or isn't cached). */
function filterShas(cache: CacheFile, shas: string[], options: CliOptions): string[] {
  if (options.excludeOwners.length === 0) {
    return shas;
  }
  const excludedOwners = new Set(options.excludeOwners);
  return shas.filter((sha) => {
    const commit = cache.commits[sha];
    if (!commit) {
      return false;
    }
    return !excludedOwners.has(getRepoOwner(commit.repositoryFullName));
  });
}

/** Orders shas newest-first, then by repo name, then by sha (stable output). */
function sortFilteredShas(cache: CacheFile, shas: string[]): string[] {
  return [...shas].sort((leftSha, rightSha) => {
    const left = cache.commits[leftSha];
    const right = cache.commits[rightSha];
    const leftTime = left?.committedAt ? Date.parse(left.committedAt) : 0;
    const rightTime = right?.committedAt ? Date.parse(right.committedAt) : 0;
    if (rightTime !== leftTime) {
      return rightTime - leftTime;
    }
    const repoCompare = (left?.repositoryFullName ?? "").localeCompare(right?.repositoryFullName ?? "");
    if (repoCompare !== 0) {
      return repoCompare;
    }
    return leftSha.localeCompare(rightSha);
  });
}

/** Truncates to whole seconds — GitHub date qualifiers reject fractional parts. */
function formatQueryDate(value: Date): string {
  return new Date(Math.floor(value.getTime() / 1000) * 1000).toISOString().replace(".000Z", "Z");
}

/** Records each search item into the commit cache and the window's sha set. */
function ingestSearchItems(cache: CacheFile, items: SearchCommitItem[], shas: Set<string>) {
  for (const item of items) {
    shas.add(item.sha);
    cache.commits[item.sha] = {
      authorEmail: item.commit.author?.email ?? null,
      authorLogin: item.author?.login ?? null,
      authorName: item.commit.author?.name ?? null,
      committedAt: item.commit.author?.date ?? null,
      contributors: extractContributors(item),
      htmlUrl: item.html_url,
      repositoryFullName: item.repository.full_name,
      repositoryUrl: item.repository.html_url,
      sha: item.sha,
    };
  }
}

/**
 * Builds the deduped contributor list for a commit: the primary author plus
 * every "Co-authored-by: Name <email>" trailer, with Paperclip itself removed.
 */
function extractContributors(item: SearchCommitItem): ContributorRecord[] {
  const contributors = new Map<string, ContributorRecord>();
  const primaryAuthor = normalizeContributor({
    email: item.commit.author?.email ?? null,
    login: item.author?.login ?? null,
    name: item.commit.author?.name ?? null,
  });
  if (primaryAuthor) {
    contributors.set(primaryAuthor.key, primaryAuthor);
  }
  const coAuthorPattern = /^co-authored-by:\s*(.+?)\s*<([^>]+)>\s*$/gim;
  for (const match of item.commit.message.matchAll(coAuthorPattern)) {
    const contributor = normalizeContributor({
      email: match[2] ?? null,
      login: null,
      name: match[1] ?? null,
    });
    if (contributor) {
      contributors.set(contributor.key, contributor);
    }
  }
  return [...contributors.values()];
}

/**
 * Normalizes a raw identity into a ContributorRecord, or null when empty or
 * when the identity is Paperclip's own (by email or case-insensitive name).
 */
function normalizeContributor(input: {
  email: string | null;
  login: string | null;
  name: string | null;
}): ContributorRecord | null {
  const email = normalizeOptional(input.email);
  const login = normalizeOptional(input.login);
  const displayName = normalizeOptional(input.name) ?? login ?? email;
  if (!displayName && !email && !login) {
    return null;
  }
  if ((email && email === PAPERCLIP_EMAIL) || (displayName && displayName.toLowerCase() === PAPERCLIP_NAME)) {
    return null;
  }
  // Prefer the most stable identifier available for deduping.
  const key = login ? `login:${login}` : email ? `email:${email}` : `name:${displayName!.toLowerCase()}`;
  return {
    displayName: displayName ?? email ?? login ?? "unknown",
    email,
    key,
    login,
  };
}

function normalizeOptional(value: string | null | undefined): string | null {
  const trimmed = value?.trim();
  return trimmed ? trimmed : null;
}

function getRepoOwner(repositoryFullName: string): string {
  return repositoryFullName.split("/", 1)[0]?.toLowerCase() ?? "";
}

/**
 * Fetches additions/deletions for up to --stats-fetch-limit uncached commits,
 * using a small pool of concurrent workers. Returns the number fetched.
 */
async function enrichCommitStats(
  client: GitHubClient,
  cache: CacheFile,
  options: CliOptions,
  shas: string[],
): Promise<number> {
  const pending = shas.filter((sha) => options.refreshStats || !cache.stats[sha]).slice(0, options.statsFetchLimit);
  let nextIndex = 0;
  let fetched = 0;
  const workers = Array.from({ length: Math.min(options.statsConcurrency, pending.length) }, async () => {
    while (true) {
      // Safe without locking: index claim happens synchronously between awaits.
      const currentIndex = nextIndex;
      nextIndex += 1;
      const sha = pending[currentIndex];
      if (!sha) {
        return;
      }
      const commit = cache.commits[sha];
      if (!commit) {
        continue;
      }
      const stats = await fetchCommitStats(client, commit.repositoryFullName, sha);
      cache.stats[sha] = {
        ...stats,
        fetchedAt: new Date().toISOString(),
      };
      fetched += 1;
    }
  });
  await Promise.all(workers);
  return fetched;
}

async function fetchCommitStats(client: GitHubClient, repositoryFullName: string, sha: string): Promise<CommitStats> {
  const response = await client.getJson<{ stats?: CommitStats }>(
    `/repos/${repositoryFullName}/commits/${sha}`,
  );
  return {
    additions: response.stats?.additions ?? 0,
    deletions: response.stats?.deletions ?? 0,
    total: response.stats?.total ?? 0,
  };
}

/** Aggregates the filtered commit set into the printable/exportable Summary. */
function buildSummary(cache: CacheFile, options: CliOptions, shas: string[], fetchedThisRun: number): Summary {
  const repoNames = new Set<string>();
  const contributors = new Map<string, ContributorRecord>();
  let additions = 0;
  let deletions = 0;
  let coveredCommits = 0;
  for (const sha of shas) {
    const commit = cache.commits[sha];
    if (!commit) {
      continue;
    }
    repoNames.add(commit.repositoryFullName);
    for (const contributor of commit.contributors) {
      contributors.set(contributor.key, contributor);
    }
    const stats = cache.stats[sha];
    if (stats) {
      additions += stats.additions;
      deletions += stats.deletions;
      coveredCommits += 1;
    }
  }
  const contributorSample = [...contributors.values()]
    .sort((left, right) => left.displayName.localeCompare(right.displayName))
    .slice(0, 10);
  const repoSample = [...repoNames].sort((left, right) => left.localeCompare(right)).slice(0, 10);
  return {
    cacheFile: options.cacheFile,
    contributors: {
      count: contributors.size,
      sample: contributorSample,
    },
    detectedQuery: buildSearchQuery(options, options.start, options.end),
    lineStats: {
      additions,
      complete: coveredCommits === shas.length,
      coveredCommits,
      deletions,
      missingCommits: shas.length - coveredCommits,
      totalChanges: additions + deletions,
    },
    range: {
      end: options.end.toISOString(),
      searchField: options.searchField,
      start: options.start.toISOString(),
    },
    filters: {
      excludedOwners: [...options.excludeOwners].sort(),
    },
    repos: {
      count: repoNames.size,
      sample: repoSample,
    },
    statsFetch: {
      fetchedThisRun,
      skipped: options.skipStats,
    },
    totals: {
      commits: shas.length,
    },
  };
}

/** Human-readable summary for the default (non --json) output mode. */
function printSummary(summary: Summary) {
  console.log("Paperclip commit metrics");
  console.log(`Query: ${summary.detectedQuery}`);
  console.log(`Range: ${summary.range.start} -> ${summary.range.end} (${summary.range.searchField})`);
  if (summary.filters.excludedOwners.length > 0) {
    console.log(`Excluded owners: ${summary.filters.excludedOwners.join(", ")}`);
  }
  console.log(`Commits: ${summary.totals.commits}`);
  console.log(`Distinct repos: ${summary.repos.count}`);
  console.log(`Distinct contributors: ${summary.contributors.count}`);
  console.log(
    `Line stats: +${summary.lineStats.additions} / -${summary.lineStats.deletions} / ${summary.lineStats.totalChanges} total`,
  );
  console.log(
    `Line stat coverage: ${summary.lineStats.coveredCommits}/${summary.totals.commits}` +
      (summary.lineStats.complete ? " (complete)" : " (partial; rerun to hydrate more commits)"),
  );
  console.log(`Stats fetched this run: ${summary.statsFetch.fetchedThisRun}${summary.statsFetch.skipped ? " (skipped)" : ""}`);
  console.log(`Cache: ${summary.cacheFile}`);
  if (summary.repos.sample.length > 0) {
    console.log(`Sample repos: ${summary.repos.sample.join(", ")}`);
  }
  if (summary.contributors.sample.length > 0) {
    console.log(
      `Sample contributors: ${summary.contributors.sample
        .map((contributor) => contributor.login ?? contributor.displayName)
        .join(", ")}`,
    );
  }
}

/** Writes the filtered commits (plus summary for JSON) to --output. */
async function writeExport(
  outputPath: string,
  format: CliOptions["exportFormat"],
  cache: CacheFile,
  shas: string[],
  summary: Summary,
): Promise<void> {
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
  if (format === "json") {
    const report = {
      summary,
      commits: shas.map((sha) => buildExportRow(cache, sha)),
    };
    await fs.writeFile(outputPath, JSON.stringify(report, null, 2), "utf8");
    return;
  }
  const header = [
    "committedAt",
    "repository",
    "repositoryUrl",
    "sha",
    "commitUrl",
    "authorLogin",
    "authorName",
    "authorEmail",
    "contributors",
    "additions",
    "deletions",
    "totalChanges",
  ];
  const rows = [header.join(",")];
  for (const sha of shas) {
    const row = buildExportRow(cache, sha);
    rows.push(
      [
        row.committedAt,
        row.repository,
        row.repositoryUrl,
        row.sha,
        row.commitUrl,
        row.authorLogin,
        row.authorName,
        row.authorEmail,
        row.contributors,
        String(row.additions),
        String(row.deletions),
        String(row.totalChanges),
      ]
        .map(escapeCsv)
        .join(","),
    );
  }
  await fs.writeFile(outputPath, `${rows.join("\n")}\n`, "utf8");
}

/** Flattens one cached commit (+ optional stats) into an export row. */
function buildExportRow(cache: CacheFile, sha: string) {
  const commit = cache.commits[sha];
  if (!commit) {
    throw new Error(`Missing cached commit for sha ${sha}`);
  }
  const stats = cache.stats[sha];
  return {
    additions: stats?.additions ?? 0,
    authorEmail: commit.authorEmail ?? "",
    authorLogin: commit.authorLogin ?? "",
    authorName: commit.authorName ?? "",
    commitUrl: commit.htmlUrl,
    committedAt: commit.committedAt ?? "",
    contributors: commit.contributors.map((contributor) => contributor.login ?? contributor.displayName).join(" | "),
    deletions: stats?.deletions ?? 0,
    repository: commit.repositoryFullName,
    repositoryUrl: commit.repositoryUrl,
    sha: commit.sha,
    totalChanges: stats?.total ?? 0,
  };
}

/** RFC 4180 quoting: wrap and double-up quotes when a delimiter appears. */
function escapeCsv(value: string): string {
  if (value.includes(",") || value.includes("\"") || value.includes("\n")) {
    return `"${value.replaceAll("\"", "\"\"")}"`;
  }
  return value;
}

function makeWindowKey(start: Date, end: Date): string {
  return `${start.toISOString()}..${end.toISOString()}`;
}

/**
 * Minimal GitHub REST client over fetch. Retries indefinitely on primary
 * (x-ratelimit-remaining: 0) and secondary (retry-after) rate limits; any
 * other non-2xx response throws with the response body.
 */
class GitHubClient {
  private readonly apiBase = "https://api.github.com";
  private readonly token: string;

  constructor(token: string) {
    this.token = token;
  }

  async getJson<T>(pathname: string): Promise<T> {
    while (true) {
      const response = await fetch(`${this.apiBase}${pathname}`, {
        headers: {
          Accept: "application/vnd.github+json",
          Authorization: `Bearer ${this.token}`,
          "User-Agent": "paperclip-commit-metrics",
          "X-GitHub-Api-Version": "2022-11-28",
        },
      });
      if (response.ok) {
        return (await response.json()) as T;
      }
      // Secondary rate limit: honor retry-after (seconds), minimum 1s.
      const retryAfter = response.headers.get("retry-after");
      if ((response.status === 403 || response.status === 429) && retryAfter) {
        const waitMs = Math.max(Number.parseInt(retryAfter, 10) * 1000, 1_000);
        console.error(`GitHub secondary rate limit hit for ${pathname}; waiting ${Math.ceil(waitMs / 1000)}s...`);
        await sleep(waitMs);
        continue;
      }
      // Primary rate limit: wait until the reset epoch plus a 1s cushion.
      const remaining = response.headers.get("x-ratelimit-remaining");
      const resetAt = response.headers.get("x-ratelimit-reset");
      if ((response.status === 403 || response.status === 429) && remaining === "0" && resetAt) {
        const waitMs = Math.max(Number.parseInt(resetAt, 10) * 1000 - Date.now() + 1_000, 1_000);
        console.error(`GitHub rate limit hit for ${pathname}; waiting ${Math.ceil(waitMs / 1000)}s...`);
        await sleep(waitMs);
        continue;
      }
      const body = await response.text();
      throw new Error(`GitHub API request failed (${response.status}) for ${pathname}: ${body}`);
    }
  }
}

function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

main().catch((error) => {
  console.error(error instanceof Error ? error.message : String(error));
  process.exit(1);
});