Add Paperclip commit metrics script

Co-Authored-By: Paperclip <noreply@paperclip.ing>
This commit is contained in:
dotta
2026-03-26 16:56:12 -05:00
parent 6a72faf83b
commit 5d538d4792
2 changed files with 714 additions and 1 deletions

View File

@@ -32,7 +32,8 @@
"test:e2e:headed": "npx playwright test --config tests/e2e/playwright.config.ts --headed",
"evals:smoke": "cd evals/promptfoo && npx promptfoo@0.103.3 eval",
"test:release-smoke": "npx playwright test --config tests/release-smoke/playwright.config.ts",
"test:release-smoke:headed": "npx playwright test --config tests/release-smoke/playwright.config.ts --headed"
"test:release-smoke:headed": "npx playwright test --config tests/release-smoke/playwright.config.ts --headed",
"metrics:paperclip-commits": "tsx scripts/paperclip-commit-metrics.ts"
},
"devDependencies": {
"@playwright/test": "^1.58.2",

View File

@@ -0,0 +1,712 @@
#!/usr/bin/env npx tsx
import { execFile } from "node:child_process";
import { promises as fs } from "node:fs";
import path from "node:path";
import { promisify } from "node:util";
// Promise-returning wrapper around execFile; used to shell out to the `gh` CLI.
const execFileAsync = promisify(execFile);
// Commit-search string; the embedded quotes make GitHub match the trailer verbatim.
const DEFAULT_QUERY = "\"Co-Authored-By: Paperclip <noreply@paperclip.ing>\"";
// Where search results and per-commit stats are persisted between runs.
const DEFAULT_CACHE_FILE = path.resolve("data/paperclip-commit-metrics-cache.json");
// Lower bound for the initial search window.
const DEFAULT_SEARCH_START = "2008-01-01T00:00:00Z";
// Windows whose total_count exceeds this are recursively split in half
// (presumably chosen to stay under GitHub's search result cap — see searchWindow).
const SEARCH_WINDOW_LIMIT = 900;
// Smallest window we are willing to split down to (1 minute).
const MIN_WINDOW_MS = 60_000;
// Default cap on uncached per-commit stat fetches per run.
const DEFAULT_STATS_FETCH_LIMIT = 250;
// Default number of parallel per-commit stat requests.
const DEFAULT_STATS_CONCURRENCY = 4;
const DEFAULT_SEARCH_FIELD = "committer-date";
// Paperclip's own identity, excluded from contributor lists (see normalizeContributor).
const PAPERCLIP_EMAIL = "noreply@paperclip.ing";
const PAPERCLIP_NAME = "paperclip";
// Parsed command-line options; defaults are filled in by parseArgs.
interface CliOptions {
cacheFile: string;
end: Date;
includePrivate: boolean;
json: boolean;
query: string;
refreshSearch: boolean;
refreshStats: boolean;
searchField: "author-date" | "committer-date";
start: Date;
statsConcurrency: number;
statsFetchLimit: number;
skipStats: boolean;
}
// Shape of one item from GitHub's /search/commits response (subset of fields used).
interface SearchCommitItem {
author: {
login?: string;
} | null;
commit: {
author: {
date: string;
email: string | null;
name: string | null;
} | null;
message: string;
};
html_url: string;
repository: {
full_name: string;
html_url: string;
};
sha: string;
}
// Aggregate line statistics for a single commit.
interface CommitStats {
additions: number;
deletions: number;
total: number;
}
// Commit metadata persisted in the cache file, keyed by sha.
interface CachedCommit {
authorEmail: string | null;
authorLogin: string | null;
authorName: string | null;
committedAt: string | null;
contributors: ContributorRecord[];
htmlUrl: string;
repositoryFullName: string;
repositoryUrl: string;
sha: string;
}
// Cached line stats plus when they were fetched.
interface CachedCommitStats extends CommitStats {
fetchedAt: string;
}
// One deduplicated contributor; `key` is the dedupe identity (login/email/name based).
interface ContributorRecord {
displayName: string;
email: string | null;
key: string;
login: string | null;
}
// A completed search window, memoized so reruns can skip re-searching it.
interface WindowCacheEntry {
completedAt: string;
key: string;
shas: string[];
totalCount: number;
}
// On-disk cache layout; invalidated when version/queryKey/searchField change.
interface CacheFile {
commits: Record<string, CachedCommit>;
queryKey: string;
searchField: CliOptions["searchField"];
stats: Record<string, CachedCommitStats>;
updatedAt: string | null;
version: number;
windows: Record<string, WindowCacheEntry>;
}
// Shape of GitHub's /search/commits response (subset of fields used).
interface SearchResponse {
incomplete_results: boolean;
items: SearchCommitItem[];
total_count: number;
}
// Result of searching one time window: all shas found plus GitHub's total count.
interface SearchWindowResult {
shas: Set<string>;
totalCount: number;
}
// Final report produced by buildSummary and rendered by printSummary / --json.
interface Summary {
cacheFile: string;
contributors: {
count: number;
sample: ContributorRecord[];
};
detectedQuery: string;
lineStats: {
additions: number;
complete: boolean;
coveredCommits: number;
deletions: number;
missingCommits: number;
totalChanges: number;
};
range: {
end: string;
searchField: CliOptions["searchField"];
start: string;
};
repos: {
count: number;
sample: string[];
};
statsFetch: {
fetchedThisRun: number;
skipped: boolean;
};
totals: {
commits: number;
};
}
// Entry point: parse flags, load the cache, search for matching commits,
// optionally hydrate per-commit line stats, persist the cache, and report.
async function main() {
const options = parseArgs(process.argv.slice(2));
const cache = await loadCache(options.cacheFile, options);
const client = new GitHubClient(await resolveGitHubToken());
// Recursive window search; returns every matching commit sha in [start, end].
const { shas } = await searchWindow(client, cache, options, options.start, options.end);
const sortedShas = [...shas].sort();
let fetchedThisRun = 0;
if (!options.skipStats) {
fetchedThisRun = await enrichCommitStats(client, cache, options, sortedShas);
}
// Persist everything gathered this run before reporting.
cache.updatedAt = new Date().toISOString();
await saveCache(options.cacheFile, cache);
const summary = buildSummary(cache, options, sortedShas, fetchedThisRun);
if (options.json) {
console.log(JSON.stringify(summary, null, 2));
return;
}
printSummary(summary);
}
// Parse CLI arguments into a fully-populated CliOptions.
// Throws on unknown flags, missing values, or an invalid/inverted date range.
function parseArgs(argv: string[]): CliOptions {
const options: CliOptions = {
cacheFile: DEFAULT_CACHE_FILE,
end: new Date(),
includePrivate: false,
json: false,
query: DEFAULT_QUERY,
refreshSearch: false,
refreshStats: false,
searchField: DEFAULT_SEARCH_FIELD,
start: new Date(DEFAULT_SEARCH_START),
statsConcurrency: DEFAULT_STATS_CONCURRENCY,
statsFetchLimit: DEFAULT_STATS_FETCH_LIMIT,
skipStats: false,
};
for (let index = 0; index < argv.length; index += 1) {
const arg = argv[index];
switch (arg) {
// NOTE: `++index` both fetches the flag's value and skips it on the next
// loop iteration.
case "--cache-file":
options.cacheFile = requireValue(argv, ++index, arg);
break;
case "--end":
options.end = parseDateArg(requireValue(argv, ++index, arg), arg);
break;
case "--include-private":
options.includePrivate = true;
break;
case "--json":
options.json = true;
break;
case "--query":
options.query = requireValue(argv, ++index, arg);
break;
case "--refresh-search":
options.refreshSearch = true;
break;
case "--refresh-stats":
options.refreshStats = true;
break;
case "--search-field": {
const value = requireValue(argv, ++index, arg);
if (value !== "author-date" && value !== "committer-date") {
throw new Error(`Invalid --search-field value: ${value}`);
}
options.searchField = value;
break;
}
case "--skip-stats":
options.skipStats = true;
break;
case "--start":
options.start = parseDateArg(requireValue(argv, ++index, arg), arg);
break;
case "--stats-concurrency":
options.statsConcurrency = parsePositiveInt(requireValue(argv, ++index, arg), arg);
break;
case "--stats-fetch-limit":
options.statsFetchLimit = parseNonNegativeInt(requireValue(argv, ++index, arg), arg);
break;
case "--help":
printHelp();
process.exit(0);
break;
default:
throw new Error(`Unknown argument: ${arg}`);
}
}
if (Number.isNaN(options.start.getTime()) || Number.isNaN(options.end.getTime())) {
throw new Error("Invalid start or end date");
}
if (options.start >= options.end) {
throw new Error("--start must be earlier than --end");
}
return options;
}
/** Fetch the value following a flag in argv; a missing/empty value is a usage error. */
function requireValue(argv: string[], index: number, flag: string): string {
  const candidate = argv[index];
  if (candidate) {
    return candidate;
  }
  throw new Error(`Missing value for ${flag}`);
}
/** Parse a date flag value, rejecting anything Date cannot interpret. */
function parseDateArg(value: string, flag: string): Date {
  const date = new Date(value);
  if (!Number.isNaN(date.getTime())) {
    return date;
  }
  throw new Error(`Invalid date for ${flag}: ${value}`);
}
/**
 * Parse a strictly positive integer flag value.
 *
 * Uses Number() rather than parseInt so that trailing garbage ("3abc") and
 * truncated scientific notation ("1e3" previously parsed as 1) are rejected
 * instead of being silently misread. Throws with the flag name on any
 * non-integer or non-positive input.
 */
function parsePositiveInt(value: string, flag: string): number {
  const trimmed = value.trim();
  // Number("") is 0, which would slip past the integer check — reject explicitly.
  const parsed = trimmed === "" ? Number.NaN : Number(trimmed);
  if (!Number.isInteger(parsed) || parsed <= 0) {
    throw new Error(`Invalid positive integer for ${flag}: ${value}`);
  }
  return parsed;
}
/**
 * Parse a non-negative integer flag value (0 allowed).
 *
 * Uses Number() rather than parseInt so that trailing garbage ("10x") is
 * rejected instead of being silently truncated to 10. Throws with the flag
 * name on any non-integer or negative input.
 */
function parseNonNegativeInt(value: string, flag: string): number {
  const trimmed = value.trim();
  // Number("") is 0, which would wrongly pass — reject empty input explicitly.
  const parsed = trimmed === "" ? Number.NaN : Number(trimmed);
  if (!Number.isInteger(parsed) || parsed < 0) {
    throw new Error(`Invalid non-negative integer for ${flag}: ${value}`);
  }
  return parsed;
}
// Print CLI usage. Keep the flag list in sync with the switch in parseArgs.
function printHelp() {
console.log(`Usage: tsx scripts/paperclip-commit-metrics.ts [options]
Options:
--start <date> ISO date/time lower bound (default: ${DEFAULT_SEARCH_START})
--end <date> ISO date/time upper bound (default: now)
--query <search> Commit search string (default: ${DEFAULT_QUERY})
--search-field <field> author-date | committer-date (default: ${DEFAULT_SEARCH_FIELD})
--include-private Include repos visible to the current token
--cache-file <path> Cache path (default: ${DEFAULT_CACHE_FILE})
--skip-stats Skip additions/deletions enrichment
--stats-fetch-limit <n> Max uncached commit stats to fetch this run (default: ${DEFAULT_STATS_FETCH_LIMIT})
--stats-concurrency <n> Parallel commit stat requests (default: ${DEFAULT_STATS_CONCURRENCY})
--refresh-search Ignore cached search windows
--refresh-stats Re-fetch cached commit stats
--json Print JSON summary
--help Show this help
`);
}
/**
 * Resolve a GitHub API token: prefer GITHUB_TOKEN/GH_TOKEN from the
 * environment, otherwise fall back to `gh auth token`.
 *
 * Fix: previously, if the `gh` binary was missing or exited non-zero, the raw
 * spawn error (e.g. ENOENT) escaped and the friendly guidance message was
 * never shown. Wrap the CLI fallback so every failure path produces the same
 * actionable error.
 */
async function resolveGitHubToken(): Promise<string> {
  const envToken = process.env.GITHUB_TOKEN ?? process.env.GH_TOKEN;
  if (envToken) {
    return envToken;
  }
  let stdout = "";
  try {
    ({ stdout } = await execFileAsync("gh", ["auth", "token"]));
  } catch {
    // gh missing, not logged in, or otherwise failed — fall through to the
    // unified error below.
  }
  const token = stdout.trim();
  if (!token) {
    throw new Error("Unable to resolve a GitHub token. Set GITHUB_TOKEN/GH_TOKEN or run `gh auth login`.");
  }
  return token;
}
/**
 * Load the persisted cache from disk. A missing file, a version mismatch, or
 * a cache built for a different query/search-field yields a fresh empty cache;
 * any other read/parse failure propagates.
 */
async function loadCache(cacheFile: string, options: CliOptions): Promise<CacheFile> {
  let raw: string;
  try {
    raw = await fs.readFile(cacheFile, "utf8");
  } catch (error) {
    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
      return createEmptyCache(options);
    }
    throw error;
  }
  const parsed = JSON.parse(raw) as CacheFile;
  const stale =
    parsed.version !== 1 ||
    parsed.queryKey !== buildQueryKey(options) ||
    parsed.searchField !== options.searchField;
  return stale ? createEmptyCache(options) : parsed;
}
/**
 * Build a fresh cache skeleton keyed to the current query/search-field combo.
 * Property order matches the on-disk layout so serialized output stays stable.
 */
function createEmptyCache(options: CliOptions): CacheFile {
  const fresh: CacheFile = {
    commits: {},
    queryKey: buildQueryKey(options),
    searchField: options.searchField,
    stats: {},
    updatedAt: null,
    version: 1,
    windows: {},
  };
  return fresh;
}
/**
 * Cache identity for the current run: same query text, search field, and
 * visibility implies the cached windows/commits are reusable.
 */
function buildQueryKey(options: CliOptions): string {
  return JSON.stringify({
    query: options.query,
    searchField: options.searchField,
    visibility: options.includePrivate ? "all" : "public",
  });
}
/** Persist the cache as pretty-printed JSON, creating parent directories as needed. */
async function saveCache(cacheFile: string, cache: CacheFile): Promise<void> {
  const directory = path.dirname(cacheFile);
  await fs.mkdir(directory, { recursive: true });
  const payload = JSON.stringify(cache, null, 2);
  await fs.writeFile(cacheFile, payload, "utf8");
}
// Collect every matching commit sha within [start, end].
// GitHub commit search only exposes a bounded number of results per query, so
// any window whose total_count exceeds SEARCH_WINDOW_LIMIT is split in half and
// searched recursively; completed windows are memoized in cache.windows so
// reruns (without --refresh-search) skip them entirely.
async function searchWindow(
client: GitHubClient,
cache: CacheFile,
options: CliOptions,
start: Date,
end: Date,
): Promise<SearchWindowResult> {
const windowKey = makeWindowKey(start, end);
// Fast path: this exact window was fully searched on a previous run.
if (!options.refreshSearch) {
const cached = cache.windows[windowKey];
if (cached) {
return { shas: new Set(cached.shas), totalCount: cached.totalCount };
}
}
const firstPage = await searchPage(client, options, start, end, 1, 100);
if (firstPage.incomplete_results) {
throw new Error(`GitHub returned incomplete search results for window ${windowKey}`);
}
// Too many hits for one window: bisect and recurse. The right half starts
// 1 ms after the midpoint because GitHub's date-range qualifier is inclusive
// at both ends.
if (firstPage.total_count > SEARCH_WINDOW_LIMIT) {
const durationMs = end.getTime() - start.getTime();
if (durationMs <= MIN_WINDOW_MS) {
throw new Error(
`Search window ${windowKey} still has ${firstPage.total_count} results after splitting to ${durationMs}ms.`,
);
}
const midpoint = new Date(start.getTime() + Math.floor(durationMs / 2));
const left = await searchWindow(client, cache, options, start, midpoint);
const right = await searchWindow(client, cache, options, new Date(midpoint.getTime() + 1), end);
const shas = new Set([...left.shas, ...right.shas]);
// Memoize the merged window so a rerun need not re-bisect it.
cache.windows[windowKey] = {
completedAt: new Date().toISOString(),
key: windowKey,
shas: [...shas],
totalCount: shas.size,
};
return { shas, totalCount: shas.size };
}
// Window fits: page through all results (100 per page).
const pageCount = Math.ceil(firstPage.total_count / 100);
const shas = new Set<string>();
ingestSearchItems(cache, firstPage.items, shas);
for (let page = 2; page <= pageCount; page += 1) {
const response = await searchPage(client, options, start, end, page, 100);
ingestSearchItems(cache, response.items, shas);
}
cache.windows[windowKey] = {
completedAt: new Date().toISOString(),
key: windowKey,
shas: [...shas],
totalCount: firstPage.total_count,
};
return { shas, totalCount: firstPage.total_count };
}
async function searchPage(
client: GitHubClient,
options: CliOptions,
start: Date,
end: Date,
page: number,
perPage: number,
): Promise<SearchResponse> {
const searchQuery = buildSearchQuery(options, start, end);
const params = new URLSearchParams({
page: String(page),
per_page: String(perPage),
q: searchQuery,
});
return client.getJson<SearchResponse>(`/search/commits?${params.toString()}`);
}
/**
 * Compose the full search string: user query + inclusive date-range qualifier,
 * plus an is:public guard unless private repos were requested.
 */
function buildSearchQuery(options: CliOptions, start: Date, end: Date): string {
  const dateQualifier = `${options.searchField}:${formatQueryDate(start)}..${formatQueryDate(end)}`;
  const parts = [options.query, dateQualifier];
  if (!options.includePrivate) {
    parts.push("is:public");
  }
  return parts.join(" ").trim();
}
/**
 * Format a Date for a GitHub search date qualifier.
 *
 * Fix: the old implementation replaced the literal ".000Z", so any timestamp
 * with non-zero milliseconds (e.g. a bisected window midpoint) leaked
 * fractional seconds into the query string. Strip the millisecond component
 * unconditionally instead.
 */
function formatQueryDate(value: Date): string {
  return value.toISOString().replace(/\.\d{3}Z$/, "Z");
}
/** Record each result's sha and upsert its commit metadata into the cache. */
function ingestSearchItems(cache: CacheFile, items: SearchCommitItem[], shas: Set<string>) {
  for (const item of items) {
    const { sha } = item;
    shas.add(sha);
    const commitAuthor = item.commit.author;
    cache.commits[sha] = {
      authorEmail: commitAuthor?.email ?? null,
      authorLogin: item.author?.login ?? null,
      authorName: commitAuthor?.name ?? null,
      committedAt: commitAuthor?.date ?? null,
      contributors: extractContributors(item),
      htmlUrl: item.html_url,
      repositoryFullName: item.repository.full_name,
      repositoryUrl: item.repository.html_url,
      sha,
    };
  }
}
/**
 * Gather the deduplicated contributors for one commit: the primary author plus
 * every "Co-authored-by: Name <email>" trailer in the commit message.
 * Normalization may drop entries (e.g. Paperclip itself) — see normalizeContributor.
 */
function extractContributors(item: SearchCommitItem): ContributorRecord[] {
  const byKey = new Map<string, ContributorRecord>();
  const remember = (record: ContributorRecord | null) => {
    if (record) {
      byKey.set(record.key, record);
    }
  };
  remember(
    normalizeContributor({
      email: item.commit.author?.email ?? null,
      login: item.author?.login ?? null,
      name: item.commit.author?.name ?? null,
    }),
  );
  const trailerPattern = /^co-authored-by:\s*(.+?)\s*<([^>]+)>\s*$/gim;
  for (const match of item.commit.message.matchAll(trailerPattern)) {
    remember(
      normalizeContributor({
        email: match[2] ?? null,
        login: null,
        name: match[1] ?? null,
      }),
    );
  }
  return [...byKey.values()];
}
/**
 * Normalize raw author/trailer fields into a ContributorRecord.
 * Returns null when no usable identity remains, or when the contributor is
 * Paperclip itself (matched by email or case-insensitive display name).
 * The dedupe key prefers login, then email, then lowercased display name.
 */
function normalizeContributor(input: {
  email: string | null;
  login: string | null;
  name: string | null;
}): ContributorRecord | null {
  const email = normalizeOptional(input.email);
  const login = normalizeOptional(input.login);
  const displayName = normalizeOptional(input.name) ?? login ?? email;
  if (!email && !login && !displayName) {
    return null;
  }
  const matchesPaperclip =
    email === PAPERCLIP_EMAIL || (displayName !== null && displayName.toLowerCase() === PAPERCLIP_NAME);
  if (matchesPaperclip) {
    return null;
  }
  let key: string;
  if (login) {
    key = `login:${login}`;
  } else if (email) {
    key = `email:${email}`;
  } else {
    // displayName is guaranteed non-null here: if login and email are both
    // null, displayName falls back to them only via a non-null name.
    key = `name:${displayName!.toLowerCase()}`;
  }
  return {
    displayName: displayName ?? email ?? login ?? "unknown",
    email,
    key,
    login,
  };
}
/** Collapse undefined, null, and whitespace-only strings to null; trim otherwise. */
function normalizeOptional(value: string | null | undefined): string | null {
  if (value == null) {
    return null;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : null;
}
// Fetch per-commit line stats for up to statsFetchLimit uncached shas, using a
// small pool of concurrent workers that pull indices from a shared counter.
// Returns the number of commits fetched this run.
async function enrichCommitStats(
client: GitHubClient,
cache: CacheFile,
options: CliOptions,
shas: string[],
): Promise<number> {
// With --refresh-stats every sha is refetched; otherwise only uncached ones.
const pending = shas.filter((sha) => options.refreshStats || !cache.stats[sha]).slice(0, options.statsFetchLimit);
let nextIndex = 0;
let fetched = 0;
const workers = Array.from({ length: Math.min(options.statsConcurrency, pending.length) }, async () => {
while (true) {
// Safe without locking: JS is single-threaded and there is no await
// between reading and incrementing nextIndex.
const currentIndex = nextIndex;
nextIndex += 1;
const sha = pending[currentIndex];
if (!sha) {
return;
}
const commit = cache.commits[sha];
if (!commit) {
continue;
}
const stats = await fetchCommitStats(client, commit.repositoryFullName, sha);
cache.stats[sha] = {
...stats,
fetchedAt: new Date().toISOString(),
};
fetched += 1;
}
});
await Promise.all(workers);
return fetched;
}
/**
 * Fetch aggregate line stats for one commit via the single-commit endpoint.
 * Missing stats fields default to zero.
 */
async function fetchCommitStats(client: GitHubClient, repositoryFullName: string, sha: string): Promise<CommitStats> {
  const endpoint = `/repos/${repositoryFullName}/commits/${sha}`;
  const payload = await client.getJson<{ stats?: CommitStats }>(endpoint);
  const stats = payload.stats;
  return {
    additions: stats?.additions ?? 0,
    deletions: stats?.deletions ?? 0,
    total: stats?.total ?? 0,
  };
}
// Aggregate the cached data for the given shas into the final Summary:
// distinct repos, deduplicated contributors, and line-stat totals (which are
// marked "partial" when some commits have no cached stats yet).
function buildSummary(cache: CacheFile, options: CliOptions, shas: string[], fetchedThisRun: number): Summary {
const repoNames = new Set<string>();
const contributors = new Map<string, ContributorRecord>();
let additions = 0;
let deletions = 0;
// Number of shas that have cached line stats contributing to the totals.
let coveredCommits = 0;
for (const sha of shas) {
const commit = cache.commits[sha];
if (!commit) {
continue;
}
repoNames.add(commit.repositoryFullName);
for (const contributor of commit.contributors) {
contributors.set(contributor.key, contributor);
}
const stats = cache.stats[sha];
if (stats) {
additions += stats.additions;
deletions += stats.deletions;
coveredCommits += 1;
}
}
// Alphabetical samples capped at 10 entries each for display.
const contributorSample = [...contributors.values()]
.sort((left, right) => left.displayName.localeCompare(right.displayName))
.slice(0, 10);
const repoSample = [...repoNames].sort((left, right) => left.localeCompare(right)).slice(0, 10);
return {
cacheFile: options.cacheFile,
contributors: {
count: contributors.size,
sample: contributorSample,
},
detectedQuery: buildSearchQuery(options, options.start, options.end),
lineStats: {
additions,
complete: coveredCommits === shas.length,
coveredCommits,
deletions,
missingCommits: shas.length - coveredCommits,
totalChanges: additions + deletions,
},
range: {
end: options.end.toISOString(),
searchField: options.searchField,
start: options.start.toISOString(),
},
repos: {
count: repoNames.size,
sample: repoSample,
},
statsFetch: {
fetchedThisRun,
skipped: options.skipStats,
},
totals: {
commits: shas.length,
},
};
}
// Render the Summary as human-readable console output (the non --json path).
function printSummary(summary: Summary) {
console.log("Paperclip commit metrics");
console.log(`Query: ${summary.detectedQuery}`);
console.log(`Range: ${summary.range.start} -> ${summary.range.end} (${summary.range.searchField})`);
console.log(`Commits: ${summary.totals.commits}`);
console.log(`Distinct repos: ${summary.repos.count}`);
console.log(`Distinct contributors: ${summary.contributors.count}`);
console.log(
`Line stats: +${summary.lineStats.additions} / -${summary.lineStats.deletions} / ${summary.lineStats.totalChanges} total`,
);
console.log(
`Line stat coverage: ${summary.lineStats.coveredCommits}/${summary.totals.commits}` +
(summary.lineStats.complete ? " (complete)" : " (partial; rerun to hydrate more commits)"),
);
console.log(`Stats fetched this run: ${summary.statsFetch.fetchedThisRun}${summary.statsFetch.skipped ? " (skipped)" : ""}`);
console.log(`Cache: ${summary.cacheFile}`);
if (summary.repos.sample.length > 0) {
console.log(`Sample repos: ${summary.repos.sample.join(", ")}`);
}
if (summary.contributors.sample.length > 0) {
console.log(
`Sample contributors: ${summary.contributors.sample
.map((contributor) => contributor.login ?? contributor.displayName)
.join(", ")}`,
);
}
}
/** Canonical cache key for a search window: "<startISO>..<endISO>". */
function makeWindowKey(start: Date, end: Date): string {
  const bounds = [start.toISOString(), end.toISOString()];
  return bounds.join("..");
}
// Thin authenticated GitHub REST client.
// getJson retries indefinitely when the rate limit is exhausted, sleeping
// until the reset time advertised in the response headers; any other non-2xx
// response is surfaced as an error with the response body.
class GitHubClient {
private readonly apiBase = "https://api.github.com";
private readonly token: string;
constructor(token: string) {
this.token = token;
}
// GET the given API path and parse the JSON body as T.
async getJson<T>(pathname: string): Promise<T> {
while (true) {
const response = await fetch(`${this.apiBase}${pathname}`, {
headers: {
Accept: "application/vnd.github+json",
Authorization: `Bearer ${this.token}`,
"User-Agent": "paperclip-commit-metrics",
"X-GitHub-Api-Version": "2022-11-28",
},
});
if (response.ok) {
return (await response.json()) as T;
}
// 403/429 with x-ratelimit-remaining: 0 means we are rate limited;
// wait until the advertised reset time (plus 1s of slack), then retry.
const remaining = response.headers.get("x-ratelimit-remaining");
const resetAt = response.headers.get("x-ratelimit-reset");
if ((response.status === 403 || response.status === 429) && remaining === "0" && resetAt) {
const waitMs = Math.max(Number.parseInt(resetAt, 10) * 1000 - Date.now() + 1_000, 1_000);
console.error(`GitHub rate limit hit for ${pathname}; waiting ${Math.ceil(waitMs / 1000)}s...`);
await sleep(waitMs);
continue;
}
const body = await response.text();
throw new Error(`GitHub API request failed (${response.status}) for ${pathname}: ${body}`);
}
}
}
/** Promise-based delay, used for rate-limit backoff. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Run the script; report any failure as a single message and exit non-zero.
main().catch((error) => {
console.error(error instanceof Error ? error.message : String(error));
process.exit(1);
});