Re-add pi-web-access
pi/files/agent/extensions/pi-web-access/extract.ts (new file, 560 lines)
@@ -0,0 +1,560 @@
import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
import TurndownService from "turndown";
import pLimit from "p-limit";
import { activityMonitor } from "./activity.js";
import { extractRSCContent } from "./rsc-extract.js";
import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
import { extractGitHub } from "./github-extract.js";
import { isYouTubeURL, isYouTubeEnabled, extractYouTube, extractYouTubeFrame, extractYouTubeFrames, getYouTubeStreamInfo } from "./youtube-extract.js";
import { extractWithUrlContext, extractWithGeminiWeb } from "./gemini-url-context.js";
import { isVideoFile, extractVideo, extractVideoFrame, getLocalVideoDuration } from "./video-extract.js";
import { formatSeconds } from "./utils.js";

const DEFAULT_TIMEOUT_MS = 30000;
const CONCURRENT_LIMIT = 3;

const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];
const MIN_USEFUL_CONTENT = 500;

const turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

const fetchLimit = pLimit(CONCURRENT_LIMIT);

export interface VideoFrame {
  data: string;
  mimeType: string;
  timestamp: string;
}

export type FrameData = { data: string; mimeType: string };
export type FrameResult = FrameData | { error: string };

export interface ExtractedContent {
  url: string;
  title: string;
  content: string;
  error: string | null;
  thumbnail?: { data: string; mimeType: string };
  frames?: VideoFrame[];
  duration?: number;
}

export interface ExtractOptions {
  timeoutMs?: number;
  forceClone?: boolean;
  prompt?: string;
  timestamp?: string;
  frames?: number;
  model?: string;
}

const JINA_READER_BASE = "https://r.jina.ai/";
const JINA_TIMEOUT_MS = 30000;

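// Fallback extractor: fetches the page through the r.jina.ai reader proxy and returns
// markdown content, or null if the proxy fails or returns only boilerplate/minimal text.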
async function extractWithJinaReader(
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  const jinaUrl = JINA_READER_BASE + url;

  const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });

  try {
    const res = await fetch(jinaUrl, {
      headers: {
        "Accept": "text/markdown",
        "X-No-Cache": "true",
      },
      signal: AbortSignal.any([
        AbortSignal.timeout(JINA_TIMEOUT_MS),
        ...(signal ? [signal] : []),
      ]),
    });

    if (!res.ok) {
      activityMonitor.logComplete(activityId, res.status);
      return null;
    }

    const content = await res.text();
    activityMonitor.logComplete(activityId, res.status);

    const contentStart = content.indexOf("Markdown Content:");
    if (contentStart < 0) {
      return null;
    }

    const markdownPart = content.slice(contentStart + 17).trim(); // 17 = "Markdown Content:".length

    // Check for failed JS rendering or minimal content
    if (markdownPart.length < 100 ||
        markdownPart.startsWith("Loading...") ||
        markdownPart.startsWith("Please enable JavaScript")) {
      return null;
    }

    const title = extractHeadingTitle(markdownPart) ?? (new URL(url).pathname.split("/").pop() || url);
    return { url, title, content: markdownPart, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return null;
  }
}

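// Timestamp parsing: accepts plain seconds, MM:SS, or HH:MM:SS; parseTimestampSpec
// additionally accepts a "START-END" range (e.g. "1:30-2:00").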
function parseTimestamp(ts: string): number | null {
  const num = Number(ts);
  if (!isNaN(num) && num >= 0) return Math.floor(num);
  const parts = ts.split(":").map(Number);
  if (parts.some(p => isNaN(p) || p < 0)) return null;
  if (parts.length === 3) return Math.floor(parts[0] * 3600 + parts[1] * 60 + parts[2]);
  if (parts.length === 2) return Math.floor(parts[0] * 60 + parts[1]);
  return null;
}

type TimestampSpec = { type: "single"; seconds: number } | { type: "range"; start: number; end: number };

function parseTimestampSpec(ts: string): TimestampSpec | null {
  const dashIdx = ts.indexOf("-", 1);
  if (dashIdx > 0) {
    const start = parseTimestamp(ts.slice(0, dashIdx));
    const end = parseTimestamp(ts.slice(dashIdx + 1));
    if (start !== null && end !== null && end > start) return { type: "range", start, end };
  }
  const seconds = parseTimestamp(ts);
  return seconds !== null ? { type: "single", seconds } : null;
}

const DEFAULT_RANGE_FRAMES = 6;
const MIN_FRAME_INTERVAL = 5;

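// Spread up to maxFrames timestamps evenly across [start, end], never closer together
// than MIN_FRAME_INTERVAL seconds.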
function computeRangeTimestamps(start: number, end: number, maxFrames: number = DEFAULT_RANGE_FRAMES): number[] {
  if (maxFrames <= 1) return [start];
  const duration = end - start;
  const idealInterval = duration / (maxFrames - 1);
  if (idealInterval < MIN_FRAME_INTERVAL) {
    const timestamps: number[] = [];
    for (let t = start; t <= end && timestamps.length < maxFrames; t += MIN_FRAME_INTERVAL) {
      timestamps.push(t);
    }
    return timestamps;
  }
  return Array.from({ length: maxFrames }, (_, i) => Math.round(start + i * idealInterval));
}

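// Package extracted frames into an ExtractedContent result, reporting how many of the
// requested frames were actually captured.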
function buildFrameResult(
  url: string, label: string, requestedCount: number,
  frames: VideoFrame[], error: string | null, duration?: number,
): ExtractedContent {
  if (frames.length === 0) {
    const msg = error ?? "Frame extraction failed";
    return { url, title: `Frames ${label} (0/${requestedCount})`, content: msg, error: msg };
  }
  return {
    url,
    title: `Frames ${label} (${frames.length}/${requestedCount})`,
    content: `${frames.length} frames extracted from ${label}`,
    error: null,
    frames,
    duration,
  };
}

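// Extract frames from a local video file at the given timestamps, collecting successes
// and surfacing the first error only when nothing could be extracted.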
async function extractLocalFrames(
  filePath: string, timestamps: number[],
): Promise<{ frames: VideoFrame[]; error: string | null }> {
  const results = await Promise.all(timestamps.map(async (t) => {
    const frame = await extractVideoFrame(filePath, t);
    if ("error" in frame) return { error: frame.error };
    return { ...frame, timestamp: formatSeconds(t) };
  }));
  const frames = results.filter((f): f is VideoFrame => "data" in f);
  const firstError = results.find((f): f is { error: string } => "error" in f);
  return { frames, error: frames.length === 0 && firstError ? firstError.error : null };
}

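// Main entry point. Routes a URL through the extraction pipeline: frame/timestamp requests
// for videos, then local video files, GitHub repos, and YouTube, and finally generic pages
// via direct HTTP with Jina Reader and Gemini as fallbacks.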
export async function extractContent(
  url: string,
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent> {
  if (signal?.aborted) {
    return { url, title: "", content: "", error: "Aborted" };
  }

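  // Frames-only request: sample frames evenly across the whole video.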
  if (options?.frames && !options.timestamp) {
    const frameCount = options.frames;
    const ytInfo = isYouTubeURL(url);
    if (ytInfo.isYouTube && ytInfo.videoId) {
      const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
      if ("error" in streamInfo) {
        return { url, title: "Frames", content: streamInfo.error, error: streamInfo.error };
      }
      if (streamInfo.duration === null) {
        const error = "Cannot determine video duration. Use a timestamp range instead.";
        return { url, title: "Frames", content: error, error };
      }
      const dur = Math.floor(streamInfo.duration);
      const timestamps = computeRangeTimestamps(0, dur, frameCount);
      const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
      const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
      return buildFrameResult(url, label, timestamps.length, result.frames, result.error, streamInfo.duration);
    }

    const videoInfo = isVideoFile(url);
    if (videoInfo) {
      const durationResult = await getLocalVideoDuration(videoInfo.absolutePath);
      if (typeof durationResult !== "number") {
        return { url, title: "Frames", content: durationResult.error, error: durationResult.error };
      }
      const dur = Math.floor(durationResult);
      const timestamps = computeRangeTimestamps(0, dur, frameCount);
      const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
      const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
      return buildFrameResult(url, label, timestamps.length, result.frames, result.error, durationResult);
    }

    return { url, title: "", content: "", error: "Frame extraction only works with YouTube and local video files" };
  }

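  // Timestamp request: a single frame at one time, or a set of frames across a range.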
  if (options?.timestamp) {
    const spec = parseTimestampSpec(options.timestamp);
    if (spec) {
      const frameCount = options.frames;
      const ytInfo = isYouTubeURL(url);
      if (ytInfo.isYouTube && ytInfo.videoId) {
        const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
        if ("error" in streamInfo) {
          if (spec.type === "range") {
            const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
            return { url, title: `Frames ${label}`, content: streamInfo.error, error: streamInfo.error };
          }
          if (frameCount) {
            const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
            const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
            return { url, title: `Frames ${label}`, content: streamInfo.error, error: streamInfo.error };
          }
          return { url, title: `Frame at ${options.timestamp}`, content: streamInfo.error, error: streamInfo.error };
        }

        if (spec.type === "range") {
          const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
          if (streamInfo.duration !== null && spec.end > streamInfo.duration) {
            const error = `Timestamp ${formatSeconds(spec.end)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
            return { url, title: `Frames ${label}`, content: error, error };
          }
          const timestamps = frameCount
            ? computeRangeTimestamps(spec.start, spec.end, frameCount)
            : computeRangeTimestamps(spec.start, spec.end);
          const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
          return buildFrameResult(url, label, timestamps.length, result.frames, result.error, result.duration ?? undefined);
        }

        if (frameCount) {
          const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
          const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
          if (streamInfo.duration !== null && end > streamInfo.duration) {
            const error = `Timestamp ${formatSeconds(end)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
            return { url, title: `Frames ${label}`, content: error, error };
          }
          const timestamps = computeRangeTimestamps(spec.seconds, end, frameCount);
          const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
          return buildFrameResult(url, label, timestamps.length, result.frames, result.error, result.duration ?? undefined);
        }

        if (streamInfo.duration !== null && spec.seconds > streamInfo.duration) {
          const error = `Timestamp ${formatSeconds(spec.seconds)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
          return { url, title: `Frame at ${options.timestamp}`, content: error, error };
        }
        const frame = await extractYouTubeFrame(ytInfo.videoId, spec.seconds, streamInfo);
        if ("error" in frame) {
          return { url, title: `Frame at ${options.timestamp}`, content: frame.error, error: frame.error };
        }
        return { url, title: `Frame at ${options.timestamp}`, content: `Video frame at ${options.timestamp}`, error: null, thumbnail: frame };
      }

      const videoInfo = isVideoFile(url);
      if (videoInfo) {
        if (spec.type === "range") {
          const timestamps = frameCount
            ? computeRangeTimestamps(spec.start, spec.end, frameCount)
            : computeRangeTimestamps(spec.start, spec.end);
          const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
          const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
          return buildFrameResult(url, label, timestamps.length, result.frames, result.error);
        }

        if (frameCount) {
          const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
          const timestamps = computeRangeTimestamps(spec.seconds, end, frameCount);
          const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
          const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
          return buildFrameResult(url, label, timestamps.length, result.frames, result.error);
        }

        const frame = await extractVideoFrame(videoInfo.absolutePath, spec.seconds);
        if ("error" in frame) {
          return { url, title: `Frame at ${options.timestamp}`, content: frame.error, error: frame.error };
        }
        return { url, title: `Frame at ${options.timestamp}`, content: `Video frame at ${options.timestamp}`, error: null, thumbnail: frame };
      }
    }
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    const result = await extractVideo(videoInfo, signal, options);
    return result ?? { url, title: "", content: "", error: "Video analysis requires Gemini access. Either:\n 1. Sign into gemini.google.com in Chrome (free, uses cookies)\n 2. Set GEMINI_API_KEY in ~/.pi/web-search.json" };
  }

  try {
    new URL(url);
  } catch {
    return { url, title: "", content: "", error: "Invalid URL" };
  }

  try {
    const ghResult = await extractGitHub(url, signal, options?.forceClone);
    if (ghResult) return ghResult;
  } catch {}

  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && isYouTubeEnabled()) {
    try {
      const ytResult = await extractYouTube(url, signal, options?.prompt, options?.model);
      if (ytResult) return ytResult;
    } catch {}
    return {
      url,
      title: "",
      content: "",
      error: "Could not extract YouTube video content. Sign into Google in Chrome for automatic access, or set GEMINI_API_KEY.",
    };
  }

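  // Default path: direct HTTP fetch; on recoverable failures fall back to Jina Reader,
  // then Gemini URL context / Gemini web extraction.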
  const httpResult = await extractViaHttp(url, signal, options);

  if (!httpResult.error || signal?.aborted) return httpResult;
  if (NON_RECOVERABLE_ERRORS.some(prefix => httpResult.error!.startsWith(prefix))) return httpResult;

  const jinaResult = await extractWithJinaReader(url, signal);
  if (jinaResult) return jinaResult;

  const geminiResult = await extractWithUrlContext(url, signal)
    ?? await extractWithGeminiWeb(url, signal);

  if (geminiResult) return geminiResult;

  const guidance = [
    httpResult.error,
    "",
    "Fallback options:",
    " \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
    " \u2022 Sign into gemini.google.com in Chrome",
    " \u2022 Use web_search to find content about this topic",
  ].join("\n");
  return { ...httpResult, error: guidance };
}

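// Heuristic: a nearly empty <body> combined with many <script> tags suggests the page
// renders its content client-side with JavaScript.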
function isLikelyJSRendered(html: string): boolean {
  // Extract body content
  const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  if (!bodyMatch) return false;

  const bodyHtml = bodyMatch[1];

  // Strip tags to get text content
  const textContent = bodyHtml
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "")
    .replace(/<[^>]+>/g, "")
    .replace(/\s+/g, " ")
    .trim();

  // Count scripts
  const scriptCount = (html.match(/<script/gi) || []).length;

  // Heuristic: little text content but many scripts suggests JS rendering
  return textContent.length < 500 && scriptCount > 3;
}

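// Direct HTTP extraction: fetch the URL with browser-like headers, handle PDFs and
// plain text, and run HTML through Readability + Turndown to produce markdown.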
async function extractViaHttp(
  url: string,
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent> {
  const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const activityId = activityMonitor.logStart({ type: "fetch", url });

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  const onAbort = () => controller.abort();
  signal?.addEventListener("abort", onAbort);

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
      },
    });

    if (!response.ok) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `HTTP ${response.status}: ${response.statusText}`,
      };
    }

    const contentLengthHeader = response.headers.get("content-length");
    const contentType = response.headers.get("content-type") || "";
    const isPDFContent = isPDF(url, contentType);
    const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024;
    if (contentLengthHeader) {
      const contentLength = parseInt(contentLengthHeader, 10);
      if (contentLength > maxResponseSize) {
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: "",
          content: "",
          error: `Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`,
        };
      }
    }

    if (isPDFContent) {
      try {
        const buffer = await response.arrayBuffer();
        const result = await extractPDFToMarkdown(buffer, url);
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: result.title,
          content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
          error: null,
        };
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        activityMonitor.logError(activityId, message);
        return { url, title: "", content: "", error: `PDF extraction failed: ${message}` };
      }
    }

    if (contentType.includes("application/octet-stream") ||
        contentType.includes("image/") ||
        contentType.includes("audio/") ||
        contentType.includes("video/") ||
        contentType.includes("application/zip")) {
      activityMonitor.logComplete(activityId, response.status);
      return {
        url,
        title: "",
        content: "",
        error: `Unsupported content type: ${contentType.split(";")[0]}`,
      };
    }

    const text = await response.text();
    const isHTML = contentType.includes("text/html") || contentType.includes("application/xhtml+xml");

    if (!isHTML) {
      activityMonitor.logComplete(activityId, response.status);
      const title = extractTextTitle(text, url);
      return { url, title, content: text, error: null };
    }

    const { document } = parseHTML(text);
    const reader = new Readability(document as unknown as Document);
    const article = reader.parse();

    if (!article) {
      const rscResult = extractRSCContent(text);
      if (rscResult) {
        activityMonitor.logComplete(activityId, response.status);
        return { url, title: rscResult.title, content: rscResult.content, error: null };
      }

      activityMonitor.logComplete(activityId, response.status);

      // Provide more specific error message
      const jsRendered = isLikelyJSRendered(text);
      const errorMsg = jsRendered
        ? "Page appears to be JavaScript-rendered (content loads dynamically)"
        : "Could not extract readable content from HTML structure";

      return {
        url,
        title: "",
        content: "",
        error: errorMsg,
      };
    }

    const markdown = turndown.turndown(article.content);
    activityMonitor.logComplete(activityId, response.status);

    if (markdown.length < MIN_USEFUL_CONTENT) {
      return {
        url,
        title: article.title || "",
        content: markdown,
        error: isLikelyJSRendered(text)
          ? "Page appears to be JavaScript-rendered (content loads dynamically)"
          : "Extracted content appears incomplete",
      };
    }

    return { url, title: article.title || "", content: markdown, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return { url, title: "", content: "", error: message };
  } finally {
    clearTimeout(timeoutId);
    signal?.removeEventListener("abort", onAbort);
  }
}

export function extractHeadingTitle(text: string): string | null {
  const match = text.match(/^#{1,2}\s+(.+)/m);
  if (!match) return null;
  const cleaned = match[1].replace(/\*+/g, "").trim();
  return cleaned || null;
}

function extractTextTitle(text: string, url: string): string {
  return extractHeadingTitle(text) ?? (new URL(url).pathname.split("/").pop() || url);
}

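// Extract multiple URLs concurrently, capped at CONCURRENT_LIMIT in-flight requests.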
export async function fetchAllContent(
  urls: string[],
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent[]> {
  return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, options))));
}