Files
dotfiles/pi/files/agent/extensions/pi-web-access/extract.ts
2026-02-19 22:23:48 +00:00

561 lines
20 KiB
TypeScript

import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
import TurndownService from "turndown";
import pLimit from "p-limit";
import { activityMonitor } from "./activity.js";
import { extractRSCContent } from "./rsc-extract.js";
import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
import { extractGitHub } from "./github-extract.js";
import { isYouTubeURL, isYouTubeEnabled, extractYouTube, extractYouTubeFrame, extractYouTubeFrames, getYouTubeStreamInfo } from "./youtube-extract.js";
import { extractWithUrlContext, extractWithGeminiWeb } from "./gemini-url-context.js";
import { isVideoFile, extractVideo, extractVideoFrame, getLocalVideoDuration } from "./video-extract.js";
import { formatSeconds } from "./utils.js";
const DEFAULT_TIMEOUT_MS = 30000;
const CONCURRENT_LIMIT = 3;
const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];
const MIN_USEFUL_CONTENT = 500;
const turndown = new TurndownService({
headingStyle: "atx",
codeBlockStyle: "fenced",
});
const fetchLimit = pLimit(CONCURRENT_LIMIT);
export interface VideoFrame {
data: string;
mimeType: string;
timestamp: string;
}
export type FrameData = { data: string; mimeType: string };
export type FrameResult = FrameData | { error: string };
export interface ExtractedContent {
url: string;
title: string;
content: string;
error: string | null;
thumbnail?: { data: string; mimeType: string };
frames?: VideoFrame[];
duration?: number;
}
export interface ExtractOptions {
timeoutMs?: number;
forceClone?: boolean;
prompt?: string;
timestamp?: string;
frames?: number;
model?: string;
}
const JINA_READER_BASE = "https://r.jina.ai/";
const JINA_TIMEOUT_MS = 30000;
async function extractWithJinaReader(
url: string,
signal?: AbortSignal,
): Promise<ExtractedContent | null> {
const jinaUrl = JINA_READER_BASE + url;
const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });
try {
const res = await fetch(jinaUrl, {
headers: {
"Accept": "text/markdown",
"X-No-Cache": "true",
},
signal: AbortSignal.any([
AbortSignal.timeout(JINA_TIMEOUT_MS),
...(signal ? [signal] : []),
]),
});
if (!res.ok) {
activityMonitor.logComplete(activityId, res.status);
return null;
}
const content = await res.text();
activityMonitor.logComplete(activityId, res.status);
const contentStart = content.indexOf("Markdown Content:");
if (contentStart < 0) {
return null;
}
const markdownPart = content.slice(contentStart + 17).trim(); // 17 = "Markdown Content:".length
// Check for failed JS rendering or minimal content
if (markdownPart.length < 100 ||
markdownPart.startsWith("Loading...") ||
markdownPart.startsWith("Please enable JavaScript")) {
return null;
}
const title = extractHeadingTitle(markdownPart) ?? (new URL(url).pathname.split("/").pop() || url);
return { url, title, content: markdownPart, error: null };
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
if (message.toLowerCase().includes("abort")) {
activityMonitor.logComplete(activityId, 0);
} else {
activityMonitor.logError(activityId, message);
}
return null;
}
}
function parseTimestamp(ts: string): number | null {
const num = Number(ts);
if (!isNaN(num) && num >= 0) return Math.floor(num);
const parts = ts.split(":").map(Number);
if (parts.some(p => isNaN(p) || p < 0)) return null;
if (parts.length === 3) return Math.floor(parts[0] * 3600 + parts[1] * 60 + parts[2]);
if (parts.length === 2) return Math.floor(parts[0] * 60 + parts[1]);
return null;
}
type TimestampSpec = { type: "single"; seconds: number } | { type: "range"; start: number; end: number };
function parseTimestampSpec(ts: string): TimestampSpec | null {
const dashIdx = ts.indexOf("-", 1);
if (dashIdx > 0) {
const start = parseTimestamp(ts.slice(0, dashIdx));
const end = parseTimestamp(ts.slice(dashIdx + 1));
if (start !== null && end !== null && end > start) return { type: "range", start, end };
}
const seconds = parseTimestamp(ts);
return seconds !== null ? { type: "single", seconds } : null;
}
const DEFAULT_RANGE_FRAMES = 6;
const MIN_FRAME_INTERVAL = 5;
function computeRangeTimestamps(start: number, end: number, maxFrames: number = DEFAULT_RANGE_FRAMES): number[] {
if (maxFrames <= 1) return [start];
const duration = end - start;
const idealInterval = duration / (maxFrames - 1);
if (idealInterval < MIN_FRAME_INTERVAL) {
const timestamps: number[] = [];
for (let t = start; t <= end && timestamps.length < maxFrames; t += MIN_FRAME_INTERVAL) {
timestamps.push(t);
}
return timestamps;
}
return Array.from({ length: maxFrames }, (_, i) => Math.round(start + i * idealInterval));
}
function buildFrameResult(
url: string, label: string, requestedCount: number,
frames: VideoFrame[], error: string | null, duration?: number,
): ExtractedContent {
if (frames.length === 0) {
const msg = error ?? "Frame extraction failed";
return { url, title: `Frames ${label} (0/${requestedCount})`, content: msg, error: msg };
}
return {
url,
title: `Frames ${label} (${frames.length}/${requestedCount})`,
content: `${frames.length} frames extracted from ${label}`,
error: null,
frames,
duration,
};
}
async function extractLocalFrames(
filePath: string, timestamps: number[],
): Promise<{ frames: VideoFrame[]; error: string | null }> {
const results = await Promise.all(timestamps.map(async (t) => {
const frame = await extractVideoFrame(filePath, t);
if ("error" in frame) return { error: frame.error };
return { ...frame, timestamp: formatSeconds(t) };
}));
const frames = results.filter((f): f is VideoFrame => "data" in f);
const firstError = results.find((f): f is { error: string } => "error" in f);
return { frames, error: frames.length === 0 && firstError ? firstError.error : null };
}
export async function extractContent(
url: string,
signal?: AbortSignal,
options?: ExtractOptions,
): Promise<ExtractedContent> {
if (signal?.aborted) {
return { url, title: "", content: "", error: "Aborted" };
}
if (options?.frames && !options.timestamp) {
const frameCount = options.frames;
const ytInfo = isYouTubeURL(url);
if (ytInfo.isYouTube && ytInfo.videoId) {
const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
if ("error" in streamInfo) {
return { url, title: "Frames", content: streamInfo.error, error: streamInfo.error };
}
if (streamInfo.duration === null) {
const error = "Cannot determine video duration. Use a timestamp range instead.";
return { url, title: "Frames", content: error, error };
}
const dur = Math.floor(streamInfo.duration);
const timestamps = computeRangeTimestamps(0, dur, frameCount);
const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
return buildFrameResult(url, label, timestamps.length, result.frames, result.error, streamInfo.duration);
}
const videoInfo = isVideoFile(url);
if (videoInfo) {
const durationResult = await getLocalVideoDuration(videoInfo.absolutePath);
if (typeof durationResult !== "number") {
return { url, title: "Frames", content: durationResult.error, error: durationResult.error };
}
const dur = Math.floor(durationResult);
const timestamps = computeRangeTimestamps(0, dur, frameCount);
const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
return buildFrameResult(url, label, timestamps.length, result.frames, result.error, durationResult);
}
return { url, title: "", content: "", error: "Frame extraction only works with YouTube and local video files" };
}
if (options?.timestamp) {
const spec = parseTimestampSpec(options.timestamp);
if (spec) {
const frameCount = options.frames;
const ytInfo = isYouTubeURL(url);
if (ytInfo.isYouTube && ytInfo.videoId) {
const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
if ("error" in streamInfo) {
if (spec.type === "range") {
const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
return { url, title: `Frames ${label}`, content: streamInfo.error, error: streamInfo.error };
}
if (frameCount) {
const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
return { url, title: `Frames ${label}`, content: streamInfo.error, error: streamInfo.error };
}
return { url, title: `Frame at ${options.timestamp}`, content: streamInfo.error, error: streamInfo.error };
}
if (spec.type === "range") {
const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
if (streamInfo.duration !== null && spec.end > streamInfo.duration) {
const error = `Timestamp ${formatSeconds(spec.end)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
return { url, title: `Frames ${label}`, content: error, error };
}
const timestamps = frameCount
? computeRangeTimestamps(spec.start, spec.end, frameCount)
: computeRangeTimestamps(spec.start, spec.end);
const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
return buildFrameResult(url, label, timestamps.length, result.frames, result.error, result.duration ?? undefined);
}
if (frameCount) {
const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
if (streamInfo.duration !== null && end > streamInfo.duration) {
const error = `Timestamp ${formatSeconds(end)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
return { url, title: `Frames ${label}`, content: error, error };
}
const timestamps = computeRangeTimestamps(spec.seconds, end, frameCount);
const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
return buildFrameResult(url, label, timestamps.length, result.frames, result.error, result.duration ?? undefined);
}
if (streamInfo.duration !== null && spec.seconds > streamInfo.duration) {
const error = `Timestamp ${formatSeconds(spec.seconds)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
return { url, title: `Frame at ${options.timestamp}`, content: error, error };
}
const frame = await extractYouTubeFrame(ytInfo.videoId, spec.seconds, streamInfo);
if ("error" in frame) {
return { url, title: `Frame at ${options.timestamp}`, content: frame.error, error: frame.error };
}
return { url, title: `Frame at ${options.timestamp}`, content: `Video frame at ${options.timestamp}`, error: null, thumbnail: frame };
}
const videoInfo = isVideoFile(url);
if (videoInfo) {
if (spec.type === "range") {
const timestamps = frameCount
? computeRangeTimestamps(spec.start, spec.end, frameCount)
: computeRangeTimestamps(spec.start, spec.end);
const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
return buildFrameResult(url, label, timestamps.length, result.frames, result.error);
}
if (frameCount) {
const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
const timestamps = computeRangeTimestamps(spec.seconds, end, frameCount);
const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
return buildFrameResult(url, label, timestamps.length, result.frames, result.error);
}
const frame = await extractVideoFrame(videoInfo.absolutePath, spec.seconds);
if ("error" in frame) {
return { url, title: `Frame at ${options.timestamp}`, content: frame.error, error: frame.error };
}
return { url, title: `Frame at ${options.timestamp}`, content: `Video frame at ${options.timestamp}`, error: null, thumbnail: frame };
}
}
}
const videoInfo = isVideoFile(url);
if (videoInfo) {
const result = await extractVideo(videoInfo, signal, options);
return result ?? { url, title: "", content: "", error: "Video analysis requires Gemini access. Either:\n 1. Sign into gemini.google.com in Chrome (free, uses cookies)\n 2. Set GEMINI_API_KEY in ~/.pi/web-search.json" };
}
try {
new URL(url);
} catch {
return { url, title: "", content: "", error: "Invalid URL" };
}
try {
const ghResult = await extractGitHub(url, signal, options?.forceClone);
if (ghResult) return ghResult;
} catch {}
const ytInfo = isYouTubeURL(url);
if (ytInfo.isYouTube && isYouTubeEnabled()) {
try {
const ytResult = await extractYouTube(url, signal, options?.prompt, options?.model);
if (ytResult) return ytResult;
} catch {}
return {
url,
title: "",
content: "",
error: "Could not extract YouTube video content. Sign into Google in Chrome for automatic access, or set GEMINI_API_KEY.",
};
}
const httpResult = await extractViaHttp(url, signal, options);
if (!httpResult.error || signal?.aborted) return httpResult;
if (NON_RECOVERABLE_ERRORS.some(prefix => httpResult.error!.startsWith(prefix))) return httpResult;
const jinaResult = await extractWithJinaReader(url, signal);
if (jinaResult) return jinaResult;
const geminiResult = await extractWithUrlContext(url, signal)
?? await extractWithGeminiWeb(url, signal);
if (geminiResult) return geminiResult;
const guidance = [
httpResult.error,
"",
"Fallback options:",
" \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
" \u2022 Sign into gemini.google.com in Chrome",
" \u2022 Use web_search to find content about this topic",
].join("\n");
return { ...httpResult, error: guidance };
}
function isLikelyJSRendered(html: string): boolean {
// Extract body content
const bodyMatch = html.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
if (!bodyMatch) return false;
const bodyHtml = bodyMatch[1];
// Strip tags to get text content
const textContent = bodyHtml
.replace(/<script[\s\S]*?<\/script>/gi, "")
.replace(/<style[\s\S]*?<\/style>/gi, "")
.replace(/<[^>]+>/g, "")
.replace(/\s+/g, " ")
.trim();
// Count scripts
const scriptCount = (html.match(/<script/gi) || []).length;
// Heuristic: little text content but many scripts suggests JS rendering
return textContent.length < 500 && scriptCount > 3;
}
async function extractViaHttp(
url: string,
signal?: AbortSignal,
options?: ExtractOptions,
): Promise<ExtractedContent> {
const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
const activityId = activityMonitor.logStart({ type: "fetch", url });
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
const onAbort = () => controller.abort();
signal?.addEventListener("abort", onAbort);
try {
const response = await fetch(url, {
signal: controller.signal,
headers: {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.9",
"Cache-Control": "no-cache",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
},
});
if (!response.ok) {
activityMonitor.logComplete(activityId, response.status);
return {
url,
title: "",
content: "",
error: `HTTP ${response.status}: ${response.statusText}`,
};
}
const contentLengthHeader = response.headers.get("content-length");
const contentType = response.headers.get("content-type") || "";
const isPDFContent = isPDF(url, contentType);
const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024;
if (contentLengthHeader) {
const contentLength = parseInt(contentLengthHeader, 10);
if (contentLength > maxResponseSize) {
activityMonitor.logComplete(activityId, response.status);
return {
url,
title: "",
content: "",
error: `Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`,
};
}
}
if (isPDFContent) {
try {
const buffer = await response.arrayBuffer();
const result = await extractPDFToMarkdown(buffer, url);
activityMonitor.logComplete(activityId, response.status);
return {
url,
title: result.title,
content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
error: null,
};
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
activityMonitor.logError(activityId, message);
return { url, title: "", content: "", error: `PDF extraction failed: ${message}` };
}
}
if (contentType.includes("application/octet-stream") ||
contentType.includes("image/") ||
contentType.includes("audio/") ||
contentType.includes("video/") ||
contentType.includes("application/zip")) {
activityMonitor.logComplete(activityId, response.status);
return {
url,
title: "",
content: "",
error: `Unsupported content type: ${contentType.split(";")[0]}`,
};
}
const text = await response.text();
const isHTML = contentType.includes("text/html") || contentType.includes("application/xhtml+xml");
if (!isHTML) {
activityMonitor.logComplete(activityId, response.status);
const title = extractTextTitle(text, url);
return { url, title, content: text, error: null };
}
const { document } = parseHTML(text);
const reader = new Readability(document as unknown as Document);
const article = reader.parse();
if (!article) {
const rscResult = extractRSCContent(text);
if (rscResult) {
activityMonitor.logComplete(activityId, response.status);
return { url, title: rscResult.title, content: rscResult.content, error: null };
}
activityMonitor.logComplete(activityId, response.status);
// Provide more specific error message
const jsRendered = isLikelyJSRendered(text);
const errorMsg = jsRendered
? "Page appears to be JavaScript-rendered (content loads dynamically)"
: "Could not extract readable content from HTML structure";
return {
url,
title: "",
content: "",
error: errorMsg,
};
}
const markdown = turndown.turndown(article.content);
activityMonitor.logComplete(activityId, response.status);
if (markdown.length < MIN_USEFUL_CONTENT) {
return {
url,
title: article.title || "",
content: markdown,
error: isLikelyJSRendered(text)
? "Page appears to be JavaScript-rendered (content loads dynamically)"
: "Extracted content appears incomplete",
};
}
return { url, title: article.title || "", content: markdown, error: null };
} catch (err) {
const message = err instanceof Error ? err.message : String(err);
if (message.toLowerCase().includes("abort")) {
activityMonitor.logComplete(activityId, 0);
} else {
activityMonitor.logError(activityId, message);
}
return { url, title: "", content: "", error: message };
} finally {
clearTimeout(timeoutId);
signal?.removeEventListener("abort", onAbort);
}
}
export function extractHeadingTitle(text: string): string | null {
const match = text.match(/^#{1,2}\s+(.+)/m);
if (!match) return null;
const cleaned = match[1].replace(/\*+/g, "").trim();
return cleaned || null;
}
function extractTextTitle(text: string, url: string): string {
return extractHeadingTitle(text) ?? (new URL(url).pathname.split("/").pop() || url);
}
export async function fetchAllContent(
urls: string[],
signal?: AbortSignal,
options?: ExtractOptions,
): Promise<ExtractedContent[]> {
return Promise.all(urls.map((url) => fetchLimit(() => extractContent(url, signal, options))));
}