dotfiles/pi/files/agent/extensions/pi-web-access/github-extract.ts
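
/**
 * GitHub URL extraction for the pi-web-access extension: parses github.com
 * code URLs, shallow-clones the repository into a local cache, and renders
 * the requested tree or file, falling back to the GitHub API for commit-SHA
 * URLs, oversized repositories, and failed clones.
 */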
import { existsSync, readFileSync, rmSync, statSync, readdirSync, openSync, readSync, closeSync } from "node:fs";
import { execFile } from "node:child_process";
import { homedir } from "node:os";
import { join, extname } from "node:path";
import { activityMonitor } from "./activity.js";
import type { ExtractedContent } from "./extract.js";
import { checkGhAvailable, checkRepoSize, fetchViaApi, showGhHint } from "./github-api.js";
const CONFIG_PATH = join(homedir(), ".pi", "web-search.json");
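// File extensions treated as binary without sniffing file contents.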
const BINARY_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg", ".tiff", ".tif",
  ".mp3", ".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".wav", ".ogg", ".webm", ".flac", ".aac",
  ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".zst",
  ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".lib",
  ".woff", ".woff2", ".ttf", ".otf", ".eot",
  ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
  ".sqlite", ".db", ".sqlite3",
  ".pyc", ".pyo", ".class", ".jar", ".war",
  ".iso", ".img", ".dmg",
]);
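// Dependency and build-output directories skipped when walking the repository tree.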
const NOISE_DIRS = new Set([
  "node_modules", "vendor", ".next", "dist", "build", "__pycache__",
  ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
  "target", ".gradle", ".idea", ".vscode",
]);
const MAX_INLINE_FILE_CHARS = 100_000;
const MAX_TREE_ENTRIES = 200;
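/** Parsed components of a github.com code URL (repository root, blob, or tree). */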
export interface GitHubUrlInfo {
  owner: string;
  repo: string;
  ref?: string;
  refIsFullSha: boolean;
  path?: string;
  type: "root" | "blob" | "tree";
}
interface CachedClone {
  localPath: string;
  clonePromise: Promise<string | null>;
}
interface GitHubCloneConfig {
  enabled: boolean;
  maxRepoSizeMB: number;
  cloneTimeoutSeconds: number;
  clonePath: string;
}
const cloneCache = new Map<string, CachedClone>();
let cachedConfig: GitHubCloneConfig | null = null;
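/**
 * Loads the `githubClone` section of ~/.pi/web-search.json, falling back to
 * defaults for missing or unparseable values. The result is cached until
 * clearCloneCache resets it.
 */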
function loadGitHubConfig(): GitHubCloneConfig {
  if (cachedConfig) return cachedConfig;
  const defaults: GitHubCloneConfig = {
    enabled: true,
    maxRepoSizeMB: 350,
    cloneTimeoutSeconds: 30,
    clonePath: "/tmp/pi-github-repos",
  };
  try {
    if (existsSync(CONFIG_PATH)) {
      const raw = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
      const gc = raw.githubClone ?? {};
      cachedConfig = {
        enabled: gc.enabled ?? defaults.enabled,
        maxRepoSizeMB: gc.maxRepoSizeMB ?? defaults.maxRepoSizeMB,
        cloneTimeoutSeconds: gc.cloneTimeoutSeconds ?? defaults.cloneTimeoutSeconds,
        clonePath: gc.clonePath ?? defaults.clonePath,
      };
      return cachedConfig;
    }
  } catch {
    // ignore parse errors
  }
  cachedConfig = defaults;
  return cachedConfig;
}
const NON_CODE_SEGMENTS = new Set([
  "issues", "pull", "pulls", "discussions", "releases", "wiki",
  "actions", "settings", "security", "projects", "graphs",
  "compare", "commits", "tags", "branches", "stargazers",
  "watchers", "network", "forks", "milestone", "labels",
  "packages", "codespaces", "contribute", "community",
  "sponsors", "invitations", "notifications", "insights",
]);
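/**
 * Parses a github.com URL into its owner/repo/ref/path parts, returning null
 * for non-GitHub hosts and for non-code pages (issues, pulls, releases, ...).
 * Example:
 *   parseGitHubUrl("https://github.com/foo/bar/blob/main/src/index.ts")
 *   // => { owner: "foo", repo: "bar", ref: "main", refIsFullSha: false,
 *   //      path: "src/index.ts", type: "blob" }
 */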
export function parseGitHubUrl(url: string): GitHubUrlInfo | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  if (parsed.hostname !== "github.com") return null;
  const segments = parsed.pathname.split("/").filter(Boolean);
  if (segments.length < 2) return null;
  const owner = segments[0];
  const repo = segments[1].replace(/\.git$/, "");
  if (segments[2] && NON_CODE_SEGMENTS.has(segments[2].toLowerCase())) return null;
  if (segments.length === 2) {
    return { owner, repo, refIsFullSha: false, type: "root" };
  }
  const action = segments[2];
  if (action !== "blob" && action !== "tree") return null;
  if (segments.length < 4) return null;
  const ref = segments[3];
  const refIsFullSha = /^[0-9a-f]{40}$/.test(ref);
  const pathParts = segments.slice(4);
  const path = pathParts.length > 0 ? pathParts.join("/") : "";
  return {
    owner,
    repo,
    ref,
    refIsFullSha,
    path,
    type: action as "blob" | "tree",
  };
}
function cacheKey(owner: string, repo: string, ref?: string): string {
  return ref ? `${owner}/${repo}@${ref}` : `${owner}/${repo}`;
}
function cloneDir(config: GitHubCloneConfig, owner: string, repo: string, ref?: string): string {
  const dirName = ref ? `${repo}@${ref}` : repo;
  return join(config.clonePath, owner, dirName);
}
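/**
 * Runs a clone command (argv form), resolving with the local path on success
 * or null on failure. A failed or aborted clone removes the partial directory.
 */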
function execClone(args: string[], localPath: string, timeoutMs: number, signal?: AbortSignal): Promise<string | null> {
  return new Promise((resolve) => {
    const child = execFile(args[0], args.slice(1), { timeout: timeoutMs }, (err) => {
      if (err) {
        try {
          rmSync(localPath, { recursive: true, force: true });
        } catch { /* ignore */ }
        resolve(null);
        return;
      }
      resolve(localPath);
    });
    if (signal) {
      const onAbort = () => child.kill();
      signal.addEventListener("abort", onAbort, { once: true });
      child.on("exit", () => signal.removeEventListener("abort", onAbort));
    }
  });
}
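/**
 * Shallow-clones owner/repo into the configured clone path, preferring
 * `gh repo clone` when available and falling back to plain `git clone`.
 * Any existing directory for the same repo/ref is removed first.
 */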
async function cloneRepo(
  owner: string,
  repo: string,
  ref: string | undefined,
  config: GitHubCloneConfig,
  signal?: AbortSignal,
): Promise<string | null> {
  const localPath = cloneDir(config, owner, repo, ref);
  try {
    rmSync(localPath, { recursive: true, force: true });
  } catch { /* ignore */ }
  const timeoutMs = config.cloneTimeoutSeconds * 1000;
  const hasGh = await checkGhAvailable();
  if (hasGh) {
    const args = ["gh", "repo", "clone", `${owner}/${repo}`, localPath, "--", "--depth", "1", "--single-branch"];
    if (ref) args.push("--branch", ref);
    return execClone(args, localPath, timeoutMs, signal);
  }
  showGhHint();
  const gitUrl = `https://github.com/${owner}/${repo}.git`;
  const args = ["git", "clone", "--depth", "1", "--single-branch"];
  if (ref) args.push("--branch", ref);
  args.push(gitUrl, localPath);
  return execClone(args, localPath, timeoutMs, signal);
}
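/**
 * Heuristic binary check: known binary extension, or a NUL byte within the
 * first 512 bytes of the file.
 */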
function isBinaryFile(filePath: string): boolean {
  const ext = extname(filePath).toLowerCase();
  if (BINARY_EXTENSIONS.has(ext)) return true;
  let fd: number;
  try {
    fd = openSync(filePath, "r");
  } catch {
    return false;
  }
  try {
    const buf = Buffer.alloc(512);
    const bytesRead = readSync(fd, buf, 0, 512, 0);
    for (let i = 0; i < bytesRead; i++) {
      if (buf[i] === 0) return true;
    }
  } catch {
    return false;
  } finally {
    closeSync(fd);
  }
  return false;
}
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}
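/**
 * Builds a flat, depth-first listing of the repository, skipping .git,
 * marking noise directories as "[skipped]" without descending into them,
 * and truncating at MAX_TREE_ENTRIES entries.
 */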
function buildTree(rootPath: string): string {
  const entries: string[] = [];
  function walk(dir: string, relPath: string): void {
    if (entries.length >= MAX_TREE_ENTRIES) return;
    let items: string[];
    try {
      items = readdirSync(dir).sort();
    } catch {
      return;
    }
    for (const item of items) {
      if (entries.length >= MAX_TREE_ENTRIES) return;
      if (item === ".git") continue;
      const fullPath = join(dir, item);
      let stat;
      try {
        stat = statSync(fullPath);
      } catch {
        continue;
      }
      const rel = relPath ? `${relPath}/${item}` : item;
      if (stat.isDirectory()) {
        if (NOISE_DIRS.has(item)) {
          entries.push(`${rel}/ [skipped]`);
          continue;
        }
        entries.push(`${rel}/`);
        walk(fullPath, rel);
      } else {
        entries.push(rel);
      }
    }
  }
  walk(rootPath, "");
  if (entries.length >= MAX_TREE_ENTRIES) {
    entries.push(`... (truncated at ${MAX_TREE_ENTRIES} entries)`);
  }
  return entries.join("\n");
}
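/** Lists the immediate children of a directory within the clone, with file sizes. */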
function buildDirListing(rootPath: string, subPath: string): string {
  const targetPath = join(rootPath, subPath);
  const lines: string[] = [];
  let items: string[];
  try {
    items = readdirSync(targetPath).sort();
  } catch {
    return "(directory not readable)";
  }
  for (const item of items) {
    if (item === ".git") continue;
    const fullPath = join(targetPath, item);
    try {
      const stat = statSync(fullPath);
      if (stat.isDirectory()) {
        lines.push(`  ${item}/`);
      } else {
        lines.push(`  ${item} (${formatFileSize(stat.size)})`);
      }
    } catch {
      lines.push(`  ${item} (unreadable)`);
    }
  }
  return lines.join("\n");
}
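/** Returns the first README candidate found in the clone root, truncated to 8K chars. */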
function readReadme(localPath: string): string | null {
  const candidates = ["README.md", "readme.md", "README", "README.txt", "README.rst"];
  for (const name of candidates) {
    const readmePath = join(localPath, name);
    if (existsSync(readmePath)) {
      try {
        const content = readFileSync(readmePath, "utf-8");
        return content.length > 8192 ? content.slice(0, 8192) + "\n\n[README truncated at 8K chars]" : content;
      } catch {
        return null;
      }
    }
  }
  return null;
}
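/**
 * Renders the extraction result for a cloned repo: a tree plus README for
 * root URLs, a directory listing for tree URLs, and inline file content
 * (size-capped, binary-aware) for blob URLs. Paths missing from the clone
 * fall back to showing the repository root structure.
 */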
function generateContent(localPath: string, info: GitHubUrlInfo): string {
  const lines: string[] = [];
  lines.push(`Repository cloned to: ${localPath}`);
  lines.push("");
  if (info.type === "root") {
    lines.push("## Structure");
    lines.push(buildTree(localPath));
    lines.push("");
    const readme = readReadme(localPath);
    if (readme) {
      lines.push("## README.md");
      lines.push(readme);
      lines.push("");
    }
    lines.push("Use `read` and `bash` tools at the path above to explore further.");
    return lines.join("\n");
  }
  if (info.type === "tree") {
    const dirPath = info.path || "";
    const fullDirPath = join(localPath, dirPath);
    if (!existsSync(fullDirPath)) {
      lines.push(`Path \`${dirPath}\` not found in clone. Showing repository root instead.`);
      lines.push("");
      lines.push("## Structure");
      lines.push(buildTree(localPath));
    } else {
      lines.push(`## ${dirPath || "/"}`);
      lines.push(buildDirListing(localPath, dirPath));
    }
    lines.push("");
    lines.push("Use `read` and `bash` tools at the path above to explore further.");
    return lines.join("\n");
  }
  if (info.type === "blob") {
    const filePath = info.path || "";
    const fullFilePath = join(localPath, filePath);
    if (!existsSync(fullFilePath)) {
      lines.push(`Path \`${filePath}\` not found in clone. Showing repository root instead.`);
      lines.push("");
      lines.push("## Structure");
      lines.push(buildTree(localPath));
      lines.push("");
      lines.push("Use `read` and `bash` tools at the path above to explore further.");
      return lines.join("\n");
    }
    const stat = statSync(fullFilePath);
    if (stat.isDirectory()) {
      lines.push(`## ${filePath || "/"}`);
      lines.push(buildDirListing(localPath, filePath));
      lines.push("");
      lines.push("Use `read` and `bash` tools at the path above to explore further.");
      return lines.join("\n");
    }
    if (isBinaryFile(fullFilePath)) {
      const ext = extname(filePath).replace(".", "");
      lines.push(`## ${filePath}`);
      lines.push(`Binary file (${ext}, ${formatFileSize(stat.size)}). Use \`read\` or \`bash\` tools at the path above to inspect.`);
      return lines.join("\n");
    }
    const content = readFileSync(fullFilePath, "utf-8");
    lines.push(`## ${filePath}`);
    if (content.length > MAX_INLINE_FILE_CHARS) {
      lines.push(content.slice(0, MAX_INLINE_FILE_CHARS));
      lines.push("");
      lines.push(`[File truncated at 100K chars. Full file: ${fullFilePath}]`);
    } else {
      lines.push(content);
    }
    lines.push("");
    lines.push("Use `read` and `bash` tools at the path above to explore further.");
    return lines.join("\n");
  }
  return lines.join("\n");
}
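/**
 * Waits on an in-flight or completed clone from the cache and renders it;
 * falls back to the GitHub API if the clone failed or the signal was aborted.
 */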
async function awaitCachedClone(
  cached: CachedClone,
  url: string,
  owner: string,
  repo: string,
  info: GitHubUrlInfo,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
  const result = await cached.clonePromise;
  if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
  if (result) {
    const content = generateContent(result, info);
    const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
    return { url, title, content, error: null };
  }
  return fetchViaApi(url, owner, repo, info);
}
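/**
 * Entry point for github.com URLs. Clones the repository (cached per
 * owner/repo@ref) and summarizes the requested path. Commit-SHA URLs and
 * repositories over the configured size threshold (unless forceClone is set)
 * are fetched via the GitHub API instead. Returns null for non-GitHub URLs,
 * when cloning is disabled, or when both the clone and the API fallback fail.
 */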
export async function extractGitHub(
  url: string,
  signal?: AbortSignal,
  forceClone?: boolean,
): Promise<ExtractedContent | null> {
  const info = parseGitHubUrl(url);
  if (!info) return null;
  const config = loadGitHubConfig();
  if (!config.enabled) return null;
  const { owner, repo } = info;
  const key = cacheKey(owner, repo, info.ref);
  const cached = cloneCache.get(key);
  if (cached) return awaitCachedClone(cached, url, owner, repo, info, signal);
  if (info.refIsFullSha) {
    const sizeNote = `Note: Commit SHA URLs use the GitHub API instead of cloning.`;
    return fetchViaApi(url, owner, repo, info, sizeNote);
  }
  const activityId = activityMonitor.logStart({ type: "fetch", url: `github.com/${owner}/${repo}` });
  if (!forceClone) {
    const sizeKB = await checkRepoSize(owner, repo);
    if (sizeKB !== null) {
      const sizeMB = sizeKB / 1024;
      if (sizeMB > config.maxRepoSizeMB) {
        activityMonitor.logComplete(activityId, 200);
        const sizeNote =
          `Note: Repository is ${Math.round(sizeMB)}MB (threshold: ${config.maxRepoSizeMB}MB). ` +
          `Showing API-fetched content instead of full clone. Ask the user if they'd like to clone the full repo -- ` +
          `if yes, call fetch_content again with the same URL and add forceClone: true to the params.`;
        return fetchViaApi(url, owner, repo, info, sizeNote);
      }
    }
  }
  // Re-check: another concurrent caller may have started a clone while we awaited the size check
  const cachedAfterSizeCheck = cloneCache.get(key);
  if (cachedAfterSizeCheck) return awaitCachedClone(cachedAfterSizeCheck, url, owner, repo, info, signal);
  const clonePromise = cloneRepo(owner, repo, info.ref, config, signal);
  const localPath = cloneDir(config, owner, repo, info.ref);
  cloneCache.set(key, { localPath, clonePromise });
  const result = await clonePromise;
  if (!result) {
    cloneCache.delete(key);
    activityMonitor.logError(activityId, "clone failed");
    const apiFallback = await fetchViaApi(url, owner, repo, info);
    if (apiFallback) return apiFallback;
    return null;
  }
  activityMonitor.logComplete(activityId, 200);
  const content = generateContent(result, info);
  const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
  return { url, title, content, error: null };
}
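/** Deletes all cached clone directories and resets the clone and config caches. */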
export function clearCloneCache(): void {
  for (const entry of cloneCache.values()) {
    try {
      rmSync(entry.localPath, { recursive: true, force: true });
    } catch { /* ignore */ }
  }
  cloneCache.clear();
  cachedConfig = null;
}