Re-add pi-web-access
This commit is contained in:
505
pi/files/agent/extensions/pi-web-access/github-extract.ts
Normal file
505
pi/files/agent/extensions/pi-web-access/github-extract.ts
Normal file
@@ -0,0 +1,505 @@
|
||||
import { existsSync, readFileSync, rmSync, statSync, readdirSync, openSync, readSync, closeSync } from "node:fs";
|
||||
import { execFile } from "node:child_process";
|
||||
import { homedir } from "node:os";
|
||||
import { join, extname } from "node:path";
|
||||
import { activityMonitor } from "./activity.js";
|
||||
import type { ExtractedContent } from "./extract.js";
|
||||
import { checkGhAvailable, checkRepoSize, fetchViaApi, showGhHint } from "./github-api.js";
|
||||
|
||||
const CONFIG_PATH = join(homedir(), ".pi", "web-search.json");
|
||||
|
||||
const BINARY_EXTENSIONS = new Set([
|
||||
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico", ".webp", ".svg", ".tiff", ".tif",
|
||||
".mp3", ".mp4", ".avi", ".mov", ".mkv", ".flv", ".wmv", ".wav", ".ogg", ".webm", ".flac", ".aac",
|
||||
".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar", ".zst",
|
||||
".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a", ".lib",
|
||||
".woff", ".woff2", ".ttf", ".otf", ".eot",
|
||||
".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
|
||||
".sqlite", ".db", ".sqlite3",
|
||||
".pyc", ".pyo", ".class", ".jar", ".war",
|
||||
".iso", ".img", ".dmg",
|
||||
]);
|
||||
|
||||
const NOISE_DIRS = new Set([
|
||||
"node_modules", "vendor", ".next", "dist", "build", "__pycache__",
|
||||
".venv", "venv", ".tox", ".mypy_cache", ".pytest_cache",
|
||||
"target", ".gradle", ".idea", ".vscode",
|
||||
]);
|
||||
|
||||
const MAX_INLINE_FILE_CHARS = 100_000;
|
||||
const MAX_TREE_ENTRIES = 200;
|
||||
|
||||
export interface GitHubUrlInfo {
|
||||
owner: string;
|
||||
repo: string;
|
||||
ref?: string;
|
||||
refIsFullSha: boolean;
|
||||
path?: string;
|
||||
type: "root" | "blob" | "tree";
|
||||
}
|
||||
|
||||
interface CachedClone {
|
||||
localPath: string;
|
||||
clonePromise: Promise<string | null>;
|
||||
}
|
||||
|
||||
interface GitHubCloneConfig {
|
||||
enabled: boolean;
|
||||
maxRepoSizeMB: number;
|
||||
cloneTimeoutSeconds: number;
|
||||
clonePath: string;
|
||||
}
|
||||
|
||||
const cloneCache = new Map<string, CachedClone>();
|
||||
|
||||
let cachedConfig: GitHubCloneConfig | null = null;
|
||||
|
||||
function loadGitHubConfig(): GitHubCloneConfig {
|
||||
if (cachedConfig) return cachedConfig;
|
||||
|
||||
const defaults: GitHubCloneConfig = {
|
||||
enabled: true,
|
||||
maxRepoSizeMB: 350,
|
||||
cloneTimeoutSeconds: 30,
|
||||
clonePath: "/tmp/pi-github-repos",
|
||||
};
|
||||
|
||||
try {
|
||||
if (existsSync(CONFIG_PATH)) {
|
||||
const raw = JSON.parse(readFileSync(CONFIG_PATH, "utf-8"));
|
||||
const gc = raw.githubClone ?? {};
|
||||
cachedConfig = {
|
||||
enabled: gc.enabled ?? defaults.enabled,
|
||||
maxRepoSizeMB: gc.maxRepoSizeMB ?? defaults.maxRepoSizeMB,
|
||||
cloneTimeoutSeconds: gc.cloneTimeoutSeconds ?? defaults.cloneTimeoutSeconds,
|
||||
clonePath: gc.clonePath ?? defaults.clonePath,
|
||||
};
|
||||
return cachedConfig;
|
||||
}
|
||||
} catch {
|
||||
// ignore parse errors
|
||||
}
|
||||
|
||||
cachedConfig = defaults;
|
||||
return cachedConfig;
|
||||
}
|
||||
|
||||
const NON_CODE_SEGMENTS = new Set([
|
||||
"issues", "pull", "pulls", "discussions", "releases", "wiki",
|
||||
"actions", "settings", "security", "projects", "graphs",
|
||||
"compare", "commits", "tags", "branches", "stargazers",
|
||||
"watchers", "network", "forks", "milestone", "labels",
|
||||
"packages", "codespaces", "contribute", "community",
|
||||
"sponsors", "invitations", "notifications", "insights",
|
||||
]);
|
||||
|
||||
export function parseGitHubUrl(url: string): GitHubUrlInfo | null {
|
||||
let parsed: URL;
|
||||
try {
|
||||
parsed = new URL(url);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (parsed.hostname !== "github.com") return null;
|
||||
|
||||
const segments = parsed.pathname.split("/").filter(Boolean);
|
||||
if (segments.length < 2) return null;
|
||||
|
||||
const owner = segments[0];
|
||||
const repo = segments[1].replace(/\.git$/, "");
|
||||
|
||||
if (NON_CODE_SEGMENTS.has(segments[2]?.toLowerCase())) return null;
|
||||
|
||||
if (segments.length === 2) {
|
||||
return { owner, repo, refIsFullSha: false, type: "root" };
|
||||
}
|
||||
|
||||
const action = segments[2];
|
||||
if (action !== "blob" && action !== "tree") return null;
|
||||
if (segments.length < 4) return null;
|
||||
|
||||
const ref = segments[3];
|
||||
const refIsFullSha = /^[0-9a-f]{40}$/.test(ref);
|
||||
const pathParts = segments.slice(4);
|
||||
const path = pathParts.length > 0 ? pathParts.join("/") : "";
|
||||
|
||||
return {
|
||||
owner,
|
||||
repo,
|
||||
ref,
|
||||
refIsFullSha,
|
||||
path,
|
||||
type: action as "blob" | "tree",
|
||||
};
|
||||
}
|
||||
|
||||
function cacheKey(owner: string, repo: string, ref?: string): string {
|
||||
return ref ? `${owner}/${repo}@${ref}` : `${owner}/${repo}`;
|
||||
}
|
||||
|
||||
function cloneDir(config: GitHubCloneConfig, owner: string, repo: string, ref?: string): string {
|
||||
const dirName = ref ? `${repo}@${ref}` : repo;
|
||||
return join(config.clonePath, owner, dirName);
|
||||
}
|
||||
|
||||
function execClone(args: string[], localPath: string, timeoutMs: number, signal?: AbortSignal): Promise<string | null> {
|
||||
return new Promise((resolve) => {
|
||||
const child = execFile(args[0], args.slice(1), { timeout: timeoutMs }, (err) => {
|
||||
if (err) {
|
||||
try {
|
||||
rmSync(localPath, { recursive: true, force: true });
|
||||
} catch { /* ignore */ }
|
||||
resolve(null);
|
||||
return;
|
||||
}
|
||||
resolve(localPath);
|
||||
});
|
||||
|
||||
if (signal) {
|
||||
const onAbort = () => child.kill();
|
||||
signal.addEventListener("abort", onAbort, { once: true });
|
||||
child.on("exit", () => signal.removeEventListener("abort", onAbort));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
async function cloneRepo(
|
||||
owner: string,
|
||||
repo: string,
|
||||
ref: string | undefined,
|
||||
config: GitHubCloneConfig,
|
||||
signal?: AbortSignal,
|
||||
): Promise<string | null> {
|
||||
const localPath = cloneDir(config, owner, repo, ref);
|
||||
|
||||
try {
|
||||
rmSync(localPath, { recursive: true, force: true });
|
||||
} catch { /* ignore */ }
|
||||
|
||||
const timeoutMs = config.cloneTimeoutSeconds * 1000;
|
||||
const hasGh = await checkGhAvailable();
|
||||
|
||||
if (hasGh) {
|
||||
const args = ["gh", "repo", "clone", `${owner}/${repo}`, localPath, "--", "--depth", "1", "--single-branch"];
|
||||
if (ref) args.push("--branch", ref);
|
||||
return execClone(args, localPath, timeoutMs, signal);
|
||||
}
|
||||
|
||||
showGhHint();
|
||||
|
||||
const gitUrl = `https://github.com/${owner}/${repo}.git`;
|
||||
const args = ["git", "clone", "--depth", "1", "--single-branch"];
|
||||
if (ref) args.push("--branch", ref);
|
||||
args.push(gitUrl, localPath);
|
||||
return execClone(args, localPath, timeoutMs, signal);
|
||||
}
|
||||
|
||||
function isBinaryFile(filePath: string): boolean {
|
||||
const ext = extname(filePath).toLowerCase();
|
||||
if (BINARY_EXTENSIONS.has(ext)) return true;
|
||||
|
||||
let fd: number;
|
||||
try {
|
||||
fd = openSync(filePath, "r");
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
const buf = Buffer.alloc(512);
|
||||
const bytesRead = readSync(fd, buf, 0, 512, 0);
|
||||
for (let i = 0; i < bytesRead; i++) {
|
||||
if (buf[i] === 0) return true;
|
||||
}
|
||||
} catch {
|
||||
return false;
|
||||
} finally {
|
||||
closeSync(fd);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function formatFileSize(bytes: number): string {
|
||||
if (bytes < 1024) return `${bytes} B`;
|
||||
if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;
|
||||
return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
|
||||
}
|
||||
|
||||
function buildTree(rootPath: string): string {
|
||||
const entries: string[] = [];
|
||||
|
||||
function walk(dir: string, relPath: string): void {
|
||||
if (entries.length >= MAX_TREE_ENTRIES) return;
|
||||
|
||||
let items: string[];
|
||||
try {
|
||||
items = readdirSync(dir).sort();
|
||||
} catch {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const item of items) {
|
||||
if (entries.length >= MAX_TREE_ENTRIES) return;
|
||||
if (item === ".git") continue;
|
||||
|
||||
const fullPath = join(dir, item);
|
||||
let stat;
|
||||
try {
|
||||
stat = statSync(fullPath);
|
||||
} catch {
|
||||
continue;
|
||||
}
|
||||
|
||||
const rel = relPath ? `${relPath}/${item}` : item;
|
||||
|
||||
if (stat.isDirectory()) {
|
||||
if (NOISE_DIRS.has(item)) {
|
||||
entries.push(`${rel}/ [skipped]`);
|
||||
continue;
|
||||
}
|
||||
entries.push(`${rel}/`);
|
||||
walk(fullPath, rel);
|
||||
} else {
|
||||
entries.push(rel);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
walk(rootPath, "");
|
||||
|
||||
if (entries.length >= MAX_TREE_ENTRIES) {
|
||||
entries.push(`... (truncated at ${MAX_TREE_ENTRIES} entries)`);
|
||||
}
|
||||
|
||||
return entries.join("\n");
|
||||
}
|
||||
|
||||
function buildDirListing(rootPath: string, subPath: string): string {
|
||||
const targetPath = join(rootPath, subPath);
|
||||
const lines: string[] = [];
|
||||
|
||||
let items: string[];
|
||||
try {
|
||||
items = readdirSync(targetPath).sort();
|
||||
} catch {
|
||||
return "(directory not readable)";
|
||||
}
|
||||
|
||||
for (const item of items) {
|
||||
if (item === ".git") continue;
|
||||
const fullPath = join(targetPath, item);
|
||||
try {
|
||||
const stat = statSync(fullPath);
|
||||
if (stat.isDirectory()) {
|
||||
lines.push(` ${item}/`);
|
||||
} else {
|
||||
lines.push(` ${item} (${formatFileSize(stat.size)})`);
|
||||
}
|
||||
} catch {
|
||||
lines.push(` ${item} (unreadable)`);
|
||||
}
|
||||
}
|
||||
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
function readReadme(localPath: string): string | null {
|
||||
const candidates = ["README.md", "readme.md", "README", "README.txt", "README.rst"];
|
||||
for (const name of candidates) {
|
||||
const readmePath = join(localPath, name);
|
||||
if (existsSync(readmePath)) {
|
||||
try {
|
||||
const content = readFileSync(readmePath, "utf-8");
|
||||
return content.length > 8192 ? content.slice(0, 8192) + "\n\n[README truncated at 8K chars]" : content;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function generateContent(localPath: string, info: GitHubUrlInfo): string {
|
||||
const lines: string[] = [];
|
||||
lines.push(`Repository cloned to: ${localPath}`);
|
||||
lines.push("");
|
||||
|
||||
if (info.type === "root") {
|
||||
lines.push("## Structure");
|
||||
lines.push(buildTree(localPath));
|
||||
lines.push("");
|
||||
|
||||
const readme = readReadme(localPath);
|
||||
if (readme) {
|
||||
lines.push("## README.md");
|
||||
lines.push(readme);
|
||||
lines.push("");
|
||||
}
|
||||
|
||||
lines.push("Use `read` and `bash` tools at the path above to explore further.");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
if (info.type === "tree") {
|
||||
const dirPath = info.path || "";
|
||||
const fullDirPath = join(localPath, dirPath);
|
||||
|
||||
if (!existsSync(fullDirPath)) {
|
||||
lines.push(`Path \`${dirPath}\` not found in clone. Showing repository root instead.`);
|
||||
lines.push("");
|
||||
lines.push("## Structure");
|
||||
lines.push(buildTree(localPath));
|
||||
} else {
|
||||
lines.push(`## ${dirPath || "/"}`);
|
||||
lines.push(buildDirListing(localPath, dirPath));
|
||||
}
|
||||
|
||||
lines.push("");
|
||||
lines.push("Use `read` and `bash` tools at the path above to explore further.");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
if (info.type === "blob") {
|
||||
const filePath = info.path || "";
|
||||
const fullFilePath = join(localPath, filePath);
|
||||
|
||||
if (!existsSync(fullFilePath)) {
|
||||
lines.push(`Path \`${filePath}\` not found in clone. Showing repository root instead.`);
|
||||
lines.push("");
|
||||
lines.push("## Structure");
|
||||
lines.push(buildTree(localPath));
|
||||
lines.push("");
|
||||
lines.push("Use `read` and `bash` tools at the path above to explore further.");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
const stat = statSync(fullFilePath);
|
||||
|
||||
if (stat.isDirectory()) {
|
||||
lines.push(`## ${filePath || "/"}`);
|
||||
lines.push(buildDirListing(localPath, filePath));
|
||||
lines.push("");
|
||||
lines.push("Use `read` and `bash` tools at the path above to explore further.");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
if (isBinaryFile(fullFilePath)) {
|
||||
const ext = extname(filePath).replace(".", "");
|
||||
lines.push(`## ${filePath}`);
|
||||
lines.push(`Binary file (${ext}, ${formatFileSize(stat.size)}). Use \`read\` or \`bash\` tools at the path above to inspect.`);
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
const content = readFileSync(fullFilePath, "utf-8");
|
||||
lines.push(`## ${filePath}`);
|
||||
|
||||
if (content.length > MAX_INLINE_FILE_CHARS) {
|
||||
lines.push(content.slice(0, MAX_INLINE_FILE_CHARS));
|
||||
lines.push("");
|
||||
lines.push(`[File truncated at 100K chars. Full file: ${fullFilePath}]`);
|
||||
} else {
|
||||
lines.push(content);
|
||||
}
|
||||
|
||||
lines.push("");
|
||||
lines.push("Use `read` and `bash` tools at the path above to explore further.");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
async function awaitCachedClone(
|
||||
cached: CachedClone,
|
||||
url: string,
|
||||
owner: string,
|
||||
repo: string,
|
||||
info: GitHubUrlInfo,
|
||||
signal?: AbortSignal,
|
||||
): Promise<ExtractedContent | null> {
|
||||
if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
|
||||
const result = await cached.clonePromise;
|
||||
if (signal?.aborted) return fetchViaApi(url, owner, repo, info);
|
||||
if (result) {
|
||||
const content = generateContent(result, info);
|
||||
const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
|
||||
return { url, title, content, error: null };
|
||||
}
|
||||
return fetchViaApi(url, owner, repo, info);
|
||||
}
|
||||
|
||||
export async function extractGitHub(
|
||||
url: string,
|
||||
signal?: AbortSignal,
|
||||
forceClone?: boolean,
|
||||
): Promise<ExtractedContent | null> {
|
||||
const info = parseGitHubUrl(url);
|
||||
if (!info) return null;
|
||||
|
||||
const config = loadGitHubConfig();
|
||||
if (!config.enabled) return null;
|
||||
|
||||
const { owner, repo } = info;
|
||||
const key = cacheKey(owner, repo, info.ref);
|
||||
|
||||
const cached = cloneCache.get(key);
|
||||
if (cached) return awaitCachedClone(cached, url, owner, repo, info, signal);
|
||||
|
||||
if (info.refIsFullSha) {
|
||||
const sizeNote = `Note: Commit SHA URLs use the GitHub API instead of cloning.`;
|
||||
return fetchViaApi(url, owner, repo, info, sizeNote);
|
||||
}
|
||||
|
||||
const activityId = activityMonitor.logStart({ type: "fetch", url: `github.com/${owner}/${repo}` });
|
||||
|
||||
if (!forceClone) {
|
||||
const sizeKB = await checkRepoSize(owner, repo);
|
||||
if (sizeKB !== null) {
|
||||
const sizeMB = sizeKB / 1024;
|
||||
if (sizeMB > config.maxRepoSizeMB) {
|
||||
activityMonitor.logComplete(activityId, 200);
|
||||
const sizeNote =
|
||||
`Note: Repository is ${Math.round(sizeMB)}MB (threshold: ${config.maxRepoSizeMB}MB). ` +
|
||||
`Showing API-fetched content instead of full clone. Ask the user if they'd like to clone the full repo -- ` +
|
||||
`if yes, call fetch_content again with the same URL and add forceClone: true to the params.`;
|
||||
return fetchViaApi(url, owner, repo, info, sizeNote);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Re-check: another concurrent caller may have started a clone while we awaited the size check
|
||||
const cachedAfterSizeCheck = cloneCache.get(key);
|
||||
if (cachedAfterSizeCheck) return awaitCachedClone(cachedAfterSizeCheck, url, owner, repo, info, signal);
|
||||
|
||||
const clonePromise = cloneRepo(owner, repo, info.ref, config, signal);
|
||||
const localPath = cloneDir(config, owner, repo, info.ref);
|
||||
cloneCache.set(key, { localPath, clonePromise });
|
||||
|
||||
const result = await clonePromise;
|
||||
|
||||
if (!result) {
|
||||
cloneCache.delete(key);
|
||||
activityMonitor.logError(activityId, "clone failed");
|
||||
|
||||
const apiFallback = await fetchViaApi(url, owner, repo, info);
|
||||
if (apiFallback) return apiFallback;
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
activityMonitor.logComplete(activityId, 200);
|
||||
const content = generateContent(result, info);
|
||||
const title = info.path ? `${owner}/${repo} - ${info.path}` : `${owner}/${repo}`;
|
||||
return { url, title, content, error: null };
|
||||
}
|
||||
|
||||
export function clearCloneCache(): void {
|
||||
for (const entry of cloneCache.values()) {
|
||||
try {
|
||||
rmSync(entry.localPath, { recursive: true, force: true });
|
||||
} catch { /* ignore */ }
|
||||
}
|
||||
cloneCache.clear();
|
||||
cachedConfig = null;
|
||||
}
|
||||
Reference in New Issue
Block a user