import {
  closeSync,
  existsSync,
  openSync,
  readFileSync,
  readSync,
  readdirSync,
  realpathSync,
  rmSync,
  statSync,
} from "node:fs";
import { execFile } from "node:child_process";
import { homedir } from "node:os";
import { extname, join, sep } from "node:path";
import { activityMonitor } from "./activity.js";
import type { ExtractedContent } from "./extract.js";
import { checkGhAvailable, checkRepoSize, fetchViaApi, showGhHint } from "./github-api.js";

/** Location of the user configuration file read by {@link loadGitHubConfig}. */
const CONFIG_PATH = join(homedir(), ".pi", "web-search.json");

/** File extensions that are reported as binary without inspecting the file. */
const BINARY_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg", ".tiff", ".tif",
  ".mp3", ".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".wav", ".ogg", ".webm", ".flac", ".aac",
  ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".zst",
  ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".lib",
  ".woff", ".woff2", ".ttf", ".otf", ".eot",
  ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
  ".sqlite", ".db", ".sqlite3",
  ".pyc", ".pyo", ".class", ".jar", ".war",
  ".iso", ".img", ".dmg",
]);

/** Directory names listed as `[skipped]` by {@link buildTree} instead of being walked. */
const NOISE_DIRS = new Set([
  "node_modules", "vendor", ".next", "dist", "build", "__pycache__",
  ".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
  "target", ".gradle", ".idea", ".vscode",
]);

/** Maximum number of characters of a text file that is inlined into the result. */
const MAX_INLINE_FILE_CHARS = 100_000;

/** Maximum number of entries emitted by {@link buildTree} before truncating. */
const MAX_TREE_ENTRIES = 200;

/** Footer appended to most generated views. */
const EXPLORE_HINT = "Use `read` and `bash` tools at the path above to explore further.";

/** Parsed components of a `github.com` repository, blob or tree URL. */
export interface GitHubUrlInfo {
  owner: string;
  repo: string;
  ref?: string;
  /** True when `ref` is a 40-character lowercase hexadecimal string. */
  refIsFullSha: boolean;
  /** Path below the ref exactly as it appears in the URL (still percent-encoded). */
  path?: string;
  type: "root" | "blob" | "tree";
}

/** A clone that has been started (and possibly finished) for a cache key. */
interface CachedClone {
  localPath: string;
  /** Resolves to the clone directory on success, or `null` when cloning failed. */
  clonePromise: Promise<string | null>;
}

/** Settings read from the `githubClone` section of the config file. */
interface GitHubCloneConfig {
  enabled: boolean;
  maxRepoSizeMB: number;
  cloneTimeoutSeconds: number;
  clonePath: string;
}

const cloneCache = new Map<string, CachedClone>();
let cachedConfig: GitHubCloneConfig | null = null;

/** Narrows an arbitrary JSON value to a plain object, or returns an empty one. */
function asRecord(value: unknown): Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value)
    ? (value as Record<string, unknown>)
    : {};
}

/** Returns `value` when it is a finite, non-negative number; otherwise `fallback`. */
function nonNegativeNumberOr(value: unknown, fallback: number): number {
  return typeof value === "number" && Number.isFinite(value) && value >= 0 ? value : fallback;
}

/** Produces a printable message for a value caught in a `catch` clause. */
function errorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Loads the clone settings from {@link CONFIG_PATH}, memoising the result until
 * {@link clearCloneCache} resets it.
 *
 * A missing file, unparsable JSON, or a setting of the wrong type all fall
 * back to the built-in defaults for the affected values.
 */
function loadGitHubConfig(): GitHubCloneConfig {
  if (cachedConfig) return cachedConfig;

  const defaults: GitHubCloneConfig = {
    enabled: true,
    maxRepoSizeMB: 350,
    cloneTimeoutSeconds: 30,
    clonePath: "/tmp/pi-github-repos",
  };

  try {
    if (existsSync(CONFIG_PATH)) {
      const raw: unknown = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
      const gc = asRecord(asRecord(raw).githubClone);
      cachedConfig = {
        enabled: typeof gc.enabled === "boolean" ? gc.enabled : defaults.enabled,
        maxRepoSizeMB: nonNegativeNumberOr(gc.maxRepoSizeMB, defaults.maxRepoSizeMB),
        cloneTimeoutSeconds: nonNegativeNumberOr(gc.cloneTimeoutSeconds, defaults.cloneTimeoutSeconds),
        clonePath:
          typeof gc.clonePath === "string" && gc.clonePath.length > 0 ? gc.clonePath : defaults.clonePath,
      };
      return cachedConfig;
    }
  } catch {
    // Unreadable or malformed config: deliberately fall through to the defaults.
  }

  cachedConfig = defaults;
  return cachedConfig;
}

/** Third path segments that make {@link parseGitHubUrl} reject a URL. */
const NON_CODE_SEGMENTS = new Set([
  "issues", "pull", "pulls", "discussions", "releases", "wiki", "actions",
  "settings", "security", "projects", "graphs", "compare", "commits",
  "tags", "branches", "stargazers", "watchers", "network", "forks",
  "milestone", "labels", "packages", "codespaces", "contribute",
  "community", "sponsors", "invitations", "notifications", "insights",
]);

/**
 * Parses a GitHub URL into its repository coordinates.
 *
 * @param url - Candidate URL.
 * @returns The parsed info, or `null` when the URL is not a `github.com`
 *   repository root, `/blob/<ref>/...` or `/tree/<ref>/...` URL.
 *
 * Only the single segment following `blob`/`tree` is taken as the ref.
 */
export function parseGitHubUrl(url: string): GitHubUrlInfo | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }
  if (parsed.hostname !== "github.com") return null;

  const segments = parsed.pathname.split("/").filter(Boolean);
  const [owner, rawRepo, action, ref, ...pathParts] = segments;
  if (owner === undefined || rawRepo === undefined) return null;

  const repo = rawRepo.replace(/\.git$/, "");
  if (action === undefined) {
    return { owner, repo, refIsFullSha: false, type: "root" };
  }
  if (NON_CODE_SEGMENTS.has(action.toLowerCase())) return null;
  if (action !== "blob" && action !== "tree") return null;
  if (ref === undefined) return null;

  return {
    owner,
    repo,
    ref,
    refIsFullSha: /^[0-9a-f]{40}$/.test(ref),
    path: pathParts.join("/"),
    type: action,
  };
}

/** Builds the key used in the clone cache for a repository (and optional ref). */
function cacheKey(owner: string, repo: string, ref?: string): string {
  return ref ? `${owner}/${repo}@${ref}` : `${owner}/${repo}`;
}

/** Computes the directory a repository (and optional ref) is cloned into. */
function cloneDir(config: GitHubCloneConfig, owner: string, repo: string, ref?: string): string {
  const dirName = ref ? `${repo}@${ref}` : repo;
  return join(config.clonePath, owner, dirName);
}

/**
 * Runs a clone command without a shell.
 *
 * @param args - Executable followed by its arguments.
 * @param localPath - Clone destination; removed again if the command fails.
 * @param timeoutMs - Timeout passed to `execFile`.
 * @param signal - Optional signal; aborting it kills the child process.
 * @returns `localPath` on success, `null` on any failure. Never rejects for
 *   command failures.
 */
function execClone(
  args: string[],
  localPath: string,
  timeoutMs: number,
  signal?: AbortSignal,
): Promise<string | null> {
  const [command, ...commandArgs] = args;
  if (command === undefined) return Promise.resolve(null);

  return new Promise((resolve) => {
    // Replaced below once an abort listener has been registered.
    let detachAbort = (): void => {};

    const child = execFile(command, commandArgs, { timeout: timeoutMs }, (err) => {
      // The callback fires for spawn errors as well, so detach here.
      detachAbort();
      if (err) {
        try {
          rmSync(localPath, { recursive: true, force: true });
        } catch {
          /* ignore */
        }
        resolve(null);
        return;
      }
      resolve(localPath);
    });

    if (!signal) return;
    if (signal.aborted) {
      // An already-aborted signal will never dispatch "abort"; kill right away.
      child.kill();
      return;
    }
    const onAbort = (): void => {
      child.kill();
    };
    signal.addEventListener("abort", onAbort, { once: true });
    detachAbort = () => signal.removeEventListener("abort", onAbort);
  });
}

/**
 * Shallow-clones a repository into its clone directory, first removing any
 * directory already present there.
 *
 * Uses `gh repo clone` when `checkGhAvailable()` resolves truthy, otherwise
 * calls `showGhHint()` and uses `git clone` over HTTPS.
 *
 * @returns The clone directory, or `null` when the clone command failed.
 */
async function cloneRepo(
  owner: string,
  repo: string,
  ref: string | undefined,
  config: GitHubCloneConfig,
  signal?: AbortSignal,
): Promise<string | null> {
  const localPath = cloneDir(config, owner, repo, ref);
  try {
    rmSync(localPath, { recursive: true, force: true });
  } catch {
    /* ignore */
  }

  const timeoutMs = config.cloneTimeoutSeconds * 1000;
  const hasGh = await checkGhAvailable();

  if (hasGh) {
    const args = ["gh", "repo", "clone", `${owner}/${repo}`, localPath, "--", "--depth", "1", "--single-branch"];
    if (ref) args.push("--branch", ref);
    return execClone(args, localPath, timeoutMs, signal);
  }

  showGhHint();
  const gitUrl = `https://github.com/${owner}/${repo}.git`;
  const args = ["git", "clone", "--depth", "1", "--single-branch"];
  if (ref) args.push("--branch", ref);
  args.push(gitUrl, localPath);
  return execClone(args, localPath, timeoutMs, signal);
}

/**
 * Decides whether a file should be treated as binary: either its extension is
 * in {@link BINARY_EXTENSIONS} or one of its first 512 bytes is NUL.
 * Files that cannot be opened or read are reported as non-binary.
 */
function isBinaryFile(filePath: string): boolean {
  const ext = extname(filePath).toLowerCase();
  if (BINARY_EXTENSIONS.has(ext)) return true;

  let fd: number;
  try {
    fd = openSync(filePath, "r");
  } catch {
    return false;
  }
  try {
    const buf = Buffer.alloc(512);
    const bytesRead = readSync(fd, buf, 0, 512, 0);
    for (let i = 0; i < bytesRead; i++) {
      if (buf[i] === 0) return true;
    }
  } catch {
    return false;
  } finally {
    closeSync(fd);
  }
  return false;
}

/** Formats a byte count as `B`, `KB` or `MB` (one decimal for KB/MB). */
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`;
  if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
  return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
}

/**
 * Renders a depth-first, name-sorted listing of `rootPath`, one relative path
 * per line. `.git` is omitted, {@link NOISE_DIRS} are listed but not entered,
 * and the walk stops once {@link MAX_TREE_ENTRIES} entries were collected.
 */
function buildTree(rootPath: string): string {
  const entries: string[] = [];

  function walk(dir: string, relPath: string): void {
    if (entries.length >= MAX_TREE_ENTRIES) return;

    let items: string[];
    try {
      items = readdirSync(dir).sort();
    } catch {
      return;
    }

    for (const item of items) {
      if (entries.length >= MAX_TREE_ENTRIES) return;
      if (item === ".git") continue;

      const fullPath = join(dir, item);
      let isDirectory: boolean;
      try {
        isDirectory = statSync(fullPath).isDirectory();
      } catch {
        // Entries that cannot be stat'ed (e.g. dangling symlinks) are left out.
        continue;
      }

      const rel = relPath ? `${relPath}/${item}` : item;
      if (!isDirectory) {
        entries.push(rel);
        continue;
      }
      if (NOISE_DIRS.has(item)) {
        entries.push(`${rel}/ [skipped]`);
        continue;
      }
      entries.push(`${rel}/`);
      walk(fullPath, rel);
    }
  }

  walk(rootPath, "");

  if (entries.length >= MAX_TREE_ENTRIES) {
    entries.push(`... (truncated at ${MAX_TREE_ENTRIES} entries)`);
  }
  return entries.join("\n");
}

/**
 * Renders a flat, name-sorted listing of one directory inside a clone.
 * Directories get a trailing `/`, files their formatted size.
 *
 * @returns The listing, or `"(directory not readable)"` when the directory
 *   cannot be read.
 */
function buildDirListing(rootPath: string, subPath: string): string {
  const targetPath = join(rootPath, subPath);
  const lines: string[] = [];

  let items: string[];
  try {
    items = readdirSync(targetPath).sort();
  } catch {
    return "(directory not readable)";
  }

  for (const item of items) {
    if (item === ".git") continue;
    const fullPath = join(targetPath, item);
    try {
      const stat = statSync(fullPath);
      if (stat.isDirectory()) {
        lines.push(`  ${item}/`);
      } else {
        lines.push(`  ${item}  (${formatFileSize(stat.size)})`);
      }
    } catch {
      lines.push(`  ${item}  (unreadable)`);
    }
  }
  return lines.join("\n");
}

/**
 * Reads the first README candidate that exists in `localPath`, truncated to
 * 8192 characters.
 *
 * @returns The README text, or `null` when none exists or the first existing
 *   candidate cannot be read.
 */
function readReadme(localPath: string): string | null {
  const candidates = ["README.md", "readme.md", "README", "README.txt", "README.rst"];
  for (const name of candidates) {
    const readmePath = join(localPath, name);
    if (!existsSync(readmePath)) continue;
    try {
      const content = readFileSync(readmePath, "utf-8");
      return content.length > 8192
        ? content.slice(0, 8192) + "\n\n[README truncated at 8K chars]"
        : content;
    } catch {
      return null;
    }
  }
  return null;
}

/**
 * Percent-decodes each segment of a URL path so it can be looked up on disk.
 * Segments that are not valid percent-encodings are kept verbatim.
 */
function decodeRepoPath(urlPath: string): string {
  return urlPath
    .split("/")
    .map((segment) => {
      try {
        return decodeURIComponent(segment);
      } catch {
        return segment;
      }
    })
    .join("/");
}

/**
 * Resolves `relPath` below the clone at `localPath`.
 *
 * @returns `join(localPath, relPath)` when that path exists and, after
 *   resolving symlinks, lies inside the clone; otherwise `null`. This keeps
 *   decoded `..` segments and symlinks committed to the repository from
 *   reaching files outside the clone directory.
 */
function resolveInsideClone(localPath: string, relPath: string): string | null {
  const candidate = join(localPath, relPath);
  try {
    // Resolve both sides: the clone root itself may sit behind a symlink.
    const realRoot = realpathSync(localPath);
    const realTarget = realpathSync(candidate);
    const inside = realTarget === realRoot || realTarget.startsWith(realRoot + sep);
    return inside ? candidate : null;
  } catch {
    // realpathSync throws when the path does not exist.
    return null;
  }
}

/**
 * Produces the text shown for a cloned repository.
 *
 * - `root`: tree of the repository plus the README, if any.
 * - `tree`: listing of the requested directory.
 * - `blob`: the file's text (truncated at {@link MAX_INLINE_FILE_CHARS}), a
 *   one-line notice for binary files, or a directory listing when the path is
 *   a directory.
 *
 * For `tree`/`blob`, a path that is missing from (or escapes) the clone yields
 * a notice followed by the repository tree.
 *
 * @param localPath - Directory containing the clone.
 * @param info - Parsed URL describing what to show.
 */
function generateContent(localPath: string, info: GitHubUrlInfo): string {
  const lines: string[] = [`Repository cloned to: ${localPath}`, ""];

  if (info.type === "root") {
    lines.push("## Structure", buildTree(localPath), "");
    const readme = readReadme(localPath);
    if (readme) {
      lines.push("## README.md", readme, "");
    }
    lines.push(EXPLORE_HINT);
    return lines.join("\n");
  }

  if (info.type !== "tree" && info.type !== "blob") {
    return lines.join("\n");
  }

  // URL path segments are percent-encoded; file names on disk are not.
  const relPath = decodeRepoPath(info.path ?? "");
  const fullPath = resolveInsideClone(localPath, relPath);

  if (fullPath === null) {
    lines.push(
      `Path \`${relPath}\` not found in clone. Showing repository root instead.`,
      "",
      "## Structure",
      buildTree(localPath),
      "",
      EXPLORE_HINT,
    );
    return lines.join("\n");
  }

  if (info.type === "tree") {
    lines.push(`## ${relPath || "/"}`, buildDirListing(localPath, relPath), "", EXPLORE_HINT);
    return lines.join("\n");
  }

  // Blob: the path may still turn out to be a directory, a binary, or unreadable.
  try {
    const stat = statSync(fullPath);

    if (stat.isDirectory()) {
      lines.push(`## ${relPath || "/"}`, buildDirListing(localPath, relPath), "", EXPLORE_HINT);
      return lines.join("\n");
    }

    if (isBinaryFile(fullPath)) {
      const ext = extname(relPath).replace(".", "");
      lines.push(`## ${relPath}`);
      lines.push(
        `Binary file (${ext}, ${formatFileSize(stat.size)}). Use \`read\` or \`bash\` tools at the path above to inspect.`,
      );
      return lines.join("\n");
    }

    const content = readFileSync(fullPath, "utf-8");
    lines.push(`## ${relPath}`);
    if (content.length > MAX_INLINE_FILE_CHARS) {
      lines.push(content.slice(0, MAX_INLINE_FILE_CHARS));
      lines.push("");
      lines.push(`[File truncated at 100K chars. Full file: ${fullPath}]`);
    } else {
      lines.push(content);
    }
  } catch {
    // The file vanished or is unreadable; report it instead of throwing.
    lines.push(`## ${relPath}`, "(file not readable)");
  }

  lines.push("", EXPLORE_HINT);
  return lines.join("\n");
}

/** Builds the result object for a successfully cloned repository. */
function cloneResult(
  url: string,
  owner: string,
  repo: string,
  info: GitHubUrlInfo,
  localPath: string,
): ExtractedContent {
  const content = generateContent(localPath, info);
  const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
  return { url, title, content, error: null };
}

/**
 * Waits for a clone that is already cached (finished or in flight) and renders
 * its content. Falls back to `fetchViaApi` when the clone failed or when
 * `signal` is aborted before or after the wait.
 */
async function awaitCachedClone(
  cached: CachedClone,
  url: string,
  owner: string,
  repo: string,
  info: GitHubUrlInfo,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  if (signal?.aborted) return fetchViaApi(url, owner, repo, info);

  const result = await cached.clonePromise;
  if (signal?.aborted) return fetchViaApi(url, owner, repo, info);

  if (result) return cloneResult(url, owner, repo, info, result);
  return fetchViaApi(url, owner, repo, info);
}

/**
 * Extracts content for a GitHub repository, directory or file URL by cloning
 * the repository and rendering the requested part of it.
 *
 * Falls back to `fetchViaApi` when the ref is a full commit SHA, when the
 * repository is larger than the configured threshold (unless `forceClone`),
 * or when cloning fails.
 *
 * @param url - URL to extract.
 * @param signal - Optional signal; aborting kills an in-flight clone.
 * @param forceClone - Skip the repository size check.
 * @returns The extracted content, or `null` when the URL is not handled here
 *   or cloning is disabled in the configuration.
 */
export async function extractGitHub(
  url: string,
  signal?: AbortSignal,
  forceClone?: boolean,
): Promise<ExtractedContent | null> {
  const info = parseGitHubUrl(url);
  if (!info) return null;

  const config = loadGitHubConfig();
  if (!config.enabled) return null;

  const { owner, repo } = info;
  const key = cacheKey(owner, repo, info.ref);

  const cached = cloneCache.get(key);
  if (cached) return awaitCachedClone(cached, url, owner, repo, info, signal);

  if (info.refIsFullSha) {
    const sizeNote = `Note: Commit SHA URLs use the GitHub API instead of cloning.`;
    return fetchViaApi(url, owner, repo, info, sizeNote);
  }

  const activityId = activityMonitor.logStart({ type: "fetch", url: `github.com/${owner}/${repo}` });

  if (!forceClone) {
    let sizeKB: Awaited<ReturnType<typeof checkRepoSize>>;
    try {
      sizeKB = await checkRepoSize(owner, repo);
    } catch (err) {
      // Close the activity before propagating so it does not stay "in progress".
      activityMonitor.logError(activityId, errorMessage(err));
      throw err;
    }
    if (sizeKB !== null) {
      const sizeMB = sizeKB / 1024;
      if (sizeMB > config.maxRepoSizeMB) {
        activityMonitor.logComplete(activityId, 200);
        const sizeNote =
          `Note: Repository is ${Math.round(sizeMB)}MB (threshold: ${config.maxRepoSizeMB}MB). ` +
          `Showing API-fetched content instead of full clone. Ask the user if they'd like to clone the full repo -- ` +
          `if yes, call fetch_content again with the same URL and add forceClone: true to the params.`;
        return fetchViaApi(url, owner, repo, info, sizeNote);
      }
    }
  }

  // Re-check: another concurrent caller may have started a clone while we awaited the size check
  const cachedAfterSizeCheck = cloneCache.get(key);
  if (cachedAfterSizeCheck) {
    try {
      const shared = await awaitCachedClone(cachedAfterSizeCheck, url, owner, repo, info, signal);
      activityMonitor.logComplete(activityId, 200);
      return shared;
    } catch (err) {
      activityMonitor.logError(activityId, errorMessage(err));
      throw err;
    }
  }

  // Normalise rejections to `null` so that neither this caller nor concurrent
  // waiters on the cached promise see an exception instead of the API fallback.
  const clonePromise = cloneRepo(owner, repo, info.ref, config, signal).catch(() => null);
  const localPath = cloneDir(config, owner, repo, info.ref);
  cloneCache.set(key, { localPath, clonePromise });

  const result = await clonePromise;

  if (!result) {
    cloneCache.delete(key);
    activityMonitor.logError(activityId, "clone failed");
    return fetchViaApi(url, owner, repo, info);
  }

  const extracted = cloneResult(url, owner, repo, info, result);
  activityMonitor.logComplete(activityId, 200);
  return extracted;
}

/**
 * Deletes every cached clone directory from disk, empties the clone cache and
 * drops the memoised configuration so it is re-read on next use.
 */
export function clearCloneCache(): void {
  for (const entry of cloneCache.values()) {
    try {
      rmSync(entry.localPath, { recursive: true, force: true });
    } catch {
      /* ignore */
    }
  }
  cloneCache.clear();
  cachedConfig = null;
}