import { Readability } from "@mozilla/readability";
import { parseHTML } from "linkedom";
import TurndownService from "turndown";
import pLimit from "p-limit";
import { activityMonitor } from "./activity.js";
import { extractRSCContent } from "./rsc-extract.js";
import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
import { extractGitHub } from "./github-extract.js";
import { isYouTubeURL, isYouTubeEnabled, extractYouTube, extractYouTubeFrame, extractYouTubeFrames, getYouTubeStreamInfo } from "./youtube-extract.js";
import { extractWithUrlContext, extractWithGeminiWeb } from "./gemini-url-context.js";
import { isVideoFile, extractVideo, extractVideoFrame, getLocalVideoDuration } from "./video-extract.js";
import { formatSeconds } from "./utils.js";

const DEFAULT_TIMEOUT_MS = 30000;
const CONCURRENT_LIMIT = 3;
/** HTTP extraction errors starting with one of these prefixes skip the fallback extractors. */
const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];
const MIN_USEFUL_CONTENT = 500;

const turndown = new TurndownService({
  headingStyle: "atx",
  codeBlockStyle: "fenced",
});

const fetchLimit = pLimit(CONCURRENT_LIMIT);

/** A single extracted video frame together with its formatted timestamp. */
export interface VideoFrame {
  data: string;
  mimeType: string;
  timestamp: string;
}

export type FrameData = { data: string; mimeType: string };
export type FrameResult = FrameData | { error: string };

/** Result of an extraction attempt; `error` is `null` on success. */
export interface ExtractedContent {
  url: string;
  title: string;
  content: string;
  error: string | null;
  thumbnail?: { data: string; mimeType: string };
  frames?: VideoFrame[];
  duration?: number;
}

/** Optional knobs accepted by `extractContent`. */
export interface ExtractOptions {
  timeoutMs?: number;
  forceClone?: boolean;
  prompt?: string;
  timestamp?: string;
  frames?: number;
  model?: string;
}

const JINA_READER_BASE = "https://r.jina.ai/";
const JINA_TIMEOUT_MS = 30000;

/**
 * Fetches `url` through the Jina Reader endpoint and returns the markdown it produced.
 *
 * @param url - Page to extract; appended verbatim to `JINA_READER_BASE`.
 * @param signal - Optional caller abort signal, combined with a `JINA_TIMEOUT_MS` timeout.
 * @returns The extracted content, or `null` when the response is not OK, lacks the
 *   "Markdown Content:" marker, looks like an unrendered JS page, or the request throws.
 */
async function extractWithJinaReader(
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  const contentMarker = "Markdown Content:";
  const jinaUrl = JINA_READER_BASE + url;
  const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });

  try {
    const res = await fetch(jinaUrl, {
      headers: {
        "Accept": "text/markdown",
        "X-No-Cache": "true",
      },
      signal: AbortSignal.any([
        AbortSignal.timeout(JINA_TIMEOUT_MS),
        ...(signal ? [signal] : []),
      ]),
    });

    if (!res.ok) {
      activityMonitor.logComplete(activityId, res.status);
      return null;
    }

    const content = await res.text();
    activityMonitor.logComplete(activityId, res.status);

    const contentStart = content.indexOf(contentMarker);
    if (contentStart < 0) {
      return null;
    }

    // Everything after the marker is the markdown body.
    const markdownPart = content.slice(contentStart + contentMarker.length).trim();

    // Check for failed JS rendering or minimal content
    if (
      markdownPart.length < 100 ||
      markdownPart.startsWith("Loading...") ||
      markdownPart.startsWith("Please enable JavaScript")
    ) {
      return null;
    }

    // Fall back to the last path segment, then to the URL itself, when no heading is found.
    const title = extractHeadingTitle(markdownPart) ?? (new URL(url).pathname.split("/").pop() || url);
    return { url, title, content: markdownPart, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // Aborts are logged as a completion with status 0 rather than as an error.
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return null;
  }
}

/**
 * Parses a timestamp given as plain seconds ("90"), "MM:SS" or "HH:MM:SS".
 *
 * @param ts - Raw timestamp text.
 * @returns Whole seconds (floored), or `null` when the text is blank, negative,
 *   non-finite, or not in one of the supported shapes.
 */
function parseTimestamp(ts: string): number | null {
  // Number("") and Number("   ") are 0, so blank input must be rejected explicitly.
  if (ts.trim() === "") return null;

  const num = Number(ts);
  // Number.isFinite also rejects NaN and "Infinity".
  if (Number.isFinite(num) && num >= 0) return Math.floor(num);

  const parts = ts.split(":").map(Number);
  if (parts.some(p => !Number.isFinite(p) || p < 0)) return null;
  if (parts.length === 3) return Math.floor(parts[0] * 3600 + parts[1] * 60 + parts[2]);
  if (parts.length === 2) return Math.floor(parts[0] * 60 + parts[1]);
  return null;
}

type TimestampSpec = { type: "single"; seconds: number } | { type: "range"; start: number; end: number };

/**
 * Parses either a single timestamp or a "start-end" range.
 *
 * A range is only recognised when both sides parse and `end > start`; otherwise the
 * whole string is re-parsed as a single timestamp.
 *
 * @returns The parsed spec, or `null` when the text is not a valid timestamp or range.
 */
function parseTimestampSpec(ts: string): TimestampSpec | null {
  // Search from index 1 so a leading "-" is never treated as the range separator.
  const dashIdx = ts.indexOf("-", 1);
  if (dashIdx > 0) {
    const start = parseTimestamp(ts.slice(0, dashIdx));
    const end = parseTimestamp(ts.slice(dashIdx + 1));
    if (start !== null && end !== null && end > start) return { type: "range", start, end };
  }
  const seconds = parseTimestamp(ts);
  return seconds !== null ? { type: "single", seconds } : null;
}

const DEFAULT_RANGE_FRAMES = 6;
const MIN_FRAME_INTERVAL = 5;

/**
 * Chooses the timestamps (in seconds) at which to sample frames between `start` and `end`.
 *
 * When evenly spacing `maxFrames` frames would put them closer than `MIN_FRAME_INTERVAL`
 * seconds apart, frames are instead taken every `MIN_FRAME_INTERVAL` seconds from `start`
 * up to `end`, which may yield fewer than `maxFrames` timestamps.
 *
 * @returns `[start]` when `maxFrames <= 1`; otherwise the sampled timestamps in ascending order.
 */
function computeRangeTimestamps(start: number, end: number, maxFrames: number = DEFAULT_RANGE_FRAMES): number[] {
  if (maxFrames <= 1) return [start];

  const duration = end - start;
  const idealInterval = duration / (maxFrames - 1);

  if (idealInterval < MIN_FRAME_INTERVAL) {
    const timestamps: number[] = [];
    for (let t = start; t <= end && timestamps.length < maxFrames; t += MIN_FRAME_INTERVAL) {
      timestamps.push(t);
    }
    return timestamps;
  }

  return Array.from({ length: maxFrames }, (_, i) => Math.round(start + i * idealInterval));
}

/**
 * Wraps the outcome of a multi-frame extraction in an `ExtractedContent`.
 *
 * With no frames the result is an error (using `error` or a generic message);
 * otherwise it carries the frames and the optional duration.
 */
function buildFrameResult(
  url: string,
  label: string,
  requestedCount: number,
  frames: VideoFrame[],
  error: string | null,
  duration?: number,
): ExtractedContent {
  if (frames.length === 0) {
    const msg = error ?? "Frame extraction failed";
    return { url, title: `Frames ${label} (0/${requestedCount})`, content: msg, error: msg };
  }
  return {
    url,
    title: `Frames ${label} (${frames.length}/${requestedCount})`,
    content: `${frames.length} frames extracted from ${label}`,
    error: null,
    frames,
    duration,
  };
}

/**
 * Extracts one frame per timestamp from a local video file, concurrently.
 *
 * @returns The frames that succeeded. `error` is only set (to the first failure's
 *   message) when no frame at all could be extracted.
 */
async function extractLocalFrames(
  filePath: string,
  timestamps: number[],
): Promise<{ frames: VideoFrame[]; error: string | null }> {
  const results = await Promise.all(timestamps.map(async (t) => {
    const frame = await extractVideoFrame(filePath, t);
    if ("error" in frame) return { error: frame.error };
    return { ...frame, timestamp: formatSeconds(t) };
  }));

  const frames = results.filter((f): f is VideoFrame => "data" in f);
  const firstError = results.find((f): f is { error: string } => "error" in f);
  return { frames, error: frames.length === 0 && firstError ? firstError.error : null };
}

/** What a timestamp request resolves to: one frame, or several frames across a window. */
type FramePlan =
  | { kind: "single"; seconds: number; title: string }
  | { kind: "multi"; start: number; end: number; label: string; title: string };

/**
 * Turns a parsed timestamp spec plus an optional frame count into a frame plan.
 *
 * A range is always multi-frame. A single timestamp with a frame count becomes a window
 * starting at that timestamp with frames `MIN_FRAME_INTERVAL` seconds apart.
 */
function planFrameRequest(
  spec: TimestampSpec,
  timestampText: string,
  frameCount: number | undefined,
): FramePlan {
  if (spec.type === "range") {
    const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
    return { kind: "multi", start: spec.start, end: spec.end, label, title: `Frames ${label}` };
  }
  if (frameCount) {
    const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
    const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
    return { kind: "multi", start: spec.seconds, end, label, title: `Frames ${label}` };
  }
  return { kind: "single", seconds: spec.seconds, title: `Frame at ${timestampText}` };
}

/**
 * Samples `frameCount` frames across the full duration of a YouTube or local video.
 * Any other kind of `url` yields an error result.
 */
async function extractFramesAcrossVideo(url: string, frameCount: number): Promise<ExtractedContent> {
  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && ytInfo.videoId) {
    const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
    if ("error" in streamInfo) {
      return { url, title: "Frames", content: streamInfo.error, error: streamInfo.error };
    }
    if (streamInfo.duration === null) {
      const error = "Cannot determine video duration. Use a timestamp range instead.";
      return { url, title: "Frames", content: error, error };
    }
    const dur = Math.floor(streamInfo.duration);
    const timestamps = computeRangeTimestamps(0, dur, frameCount);
    const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
    const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
    return buildFrameResult(url, label, timestamps.length, result.frames, result.error, streamInfo.duration);
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    const durationResult = await getLocalVideoDuration(videoInfo.absolutePath);
    if (typeof durationResult !== "number") {
      return { url, title: "Frames", content: durationResult.error, error: durationResult.error };
    }
    const dur = Math.floor(durationResult);
    const timestamps = computeRangeTimestamps(0, dur, frameCount);
    const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
    const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
    return buildFrameResult(url, label, timestamps.length, result.frames, result.error, durationResult);
  }

  return { url, title: "", content: "", error: "Frame extraction only works with YouTube and local video files" };
}

/**
 * Extracts the frame(s) described by `spec` from a YouTube or local video.
 *
 * @param timestampText - The caller's original timestamp string, used in titles and content.
 * @returns The frame result, or `null` when `url` is neither a YouTube video nor a
 *   local video file, so the caller can continue with regular extraction.
 */
async function extractFramesAtTimestamp(
  url: string,
  spec: TimestampSpec,
  timestampText: string,
  frameCount: number | undefined,
): Promise<ExtractedContent | null> {
  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && ytInfo.videoId) {
    const plan = planFrameRequest(spec, timestampText, frameCount);
    const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
    if ("error" in streamInfo) {
      return { url, title: plan.title, content: streamInfo.error, error: streamInfo.error };
    }

    if (plan.kind === "multi") {
      if (streamInfo.duration !== null && plan.end > streamInfo.duration) {
        const error = `Timestamp ${formatSeconds(plan.end)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
        return { url, title: plan.title, content: error, error };
      }
      // A falsy frame count (absent or 0) falls back to the default number of frames.
      const timestamps = frameCount
        ? computeRangeTimestamps(plan.start, plan.end, frameCount)
        : computeRangeTimestamps(plan.start, plan.end);
      const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
      return buildFrameResult(url, plan.label, timestamps.length, result.frames, result.error, result.duration ?? undefined);
    }

    if (streamInfo.duration !== null && plan.seconds > streamInfo.duration) {
      const error = `Timestamp ${formatSeconds(plan.seconds)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
      return { url, title: plan.title, content: error, error };
    }
    const frame = await extractYouTubeFrame(ytInfo.videoId, plan.seconds, streamInfo);
    if ("error" in frame) {
      return { url, title: plan.title, content: frame.error, error: frame.error };
    }
    return { url, title: plan.title, content: `Video frame at ${timestampText}`, error: null, thumbnail: frame };
  }

  const videoInfo = isVideoFile(url);
  if (!videoInfo) return null;

  const plan = planFrameRequest(spec, timestampText, frameCount);
  if (plan.kind === "multi") {
    const timestamps = frameCount
      ? computeRangeTimestamps(plan.start, plan.end, frameCount)
      : computeRangeTimestamps(plan.start, plan.end);
    const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
    return buildFrameResult(url, plan.label, timestamps.length, result.frames, result.error);
  }

  const frame = await extractVideoFrame(videoInfo.absolutePath, plan.seconds);
  if ("error" in frame) {
    return { url, title: plan.title, content: frame.error, error: frame.error };
  }
  return { url, title: plan.title, content: `Video frame at ${timestampText}`, error: null, thumbnail: frame };
}

/**
 * Extracts readable content (or video frames) for `url`.
 *
 * Order of attempts:
 *  1. Frame extraction, when `options.frames` and/or `options.timestamp` is set and the
 *     target is a YouTube video or local video file.
 *  2. Local video analysis via `extractVideo`.
 *  3. GitHub extraction, then YouTube extraction (when enabled).
 *  4. Plain HTTP extraction, falling back to Jina Reader and then the Gemini extractors
 *     unless the HTTP error is non-recoverable or the request was aborted.
 *
 * @param url - URL or local video path to extract.
 * @param signal - Optional abort signal; an already-aborted signal returns an "Aborted" error.
 * @param options - Optional extraction settings.
 * @returns The extracted content; failures are reported through the `error` field.
 */
export async function extractContent(
  url: string,
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent> {
  if (signal?.aborted) {
    return { url, title: "", content: "", error: "Aborted" };
  }

  // A frame count without a timestamp means "sample the whole video".
  if (options?.frames && !options.timestamp) {
    return extractFramesAcrossVideo(url, options.frames);
  }

  if (options?.timestamp) {
    const spec = parseTimestampSpec(options.timestamp);
    if (spec) {
      const frameResult = await extractFramesAtTimestamp(url, spec, options.timestamp, options.frames);
      if (frameResult) return frameResult;
    }
    // An unparseable timestamp or a non-video target falls through to regular extraction.
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    const result = await extractVideo(videoInfo, signal, options);
    return result ?? {
      url,
      title: "",
      content: "",
      error: "Video analysis requires Gemini access. Either:\n 1. Sign into gemini.google.com in Chrome (free, uses cookies)\n 2. Set GEMINI_API_KEY in ~/.pi/web-search.json",
    };
  }

  try {
    new URL(url);
  } catch {
    return { url, title: "", content: "", error: "Invalid URL" };
  }

  try {
    const ghResult = await extractGitHub(url, signal, options?.forceClone);
    if (ghResult) return ghResult;
  } catch {
    // Best effort: a GitHub extraction failure falls through to the next extractor.
  }

  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && isYouTubeEnabled()) {
    try {
      const ytResult = await extractYouTube(url, signal, options?.prompt, options?.model);
      if (ytResult) return ytResult;
    } catch {
      // Best effort: any failure is reported through the generic message below.
    }
    return {
      url,
      title: "",
      content: "",
      error: "Could not extract YouTube video content. Sign into Google in Chrome for automatic access, or set GEMINI_API_KEY.",
    };
  }

  const httpResult = await extractViaHttp(url, signal, options);
  const httpError = httpResult.error;
  if (!httpError || signal?.aborted) return httpResult;
  if (NON_RECOVERABLE_ERRORS.some(prefix => httpError.startsWith(prefix))) return httpResult;

  const jinaResult = await extractWithJinaReader(url, signal);
  if (jinaResult) return jinaResult;

  const geminiResult = await extractWithUrlContext(url, signal) ?? await extractWithGeminiWeb(url, signal);
  if (geminiResult) return geminiResult;

  const guidance = [
    httpError,
    "",
    "Fallback options:",
    " \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
    " \u2022 Sign into gemini.google.com in Chrome",
    " \u2022 Use web_search to find content about this topic",
  ].join("\n");
  return { ...httpResult, error: guidance };
}

// NOTE(review): the definition below continues past the reviewed range; it is carried through unchanged.
function isLikelyJSRendered(html: string): boolean { // Extract body content const bodyMatch = html.match(/]*>([\s\S]*?)<\/body>/i); if (!bodyMatch) return false; const bodyHtml = bodyMatch[1]; // Strip tags to get text content const textContent = bodyHtml .replace(//gi, "") .replace(//gi, "") .replace(/<[^>]+>/g, "") .replace(/\s+/g, " ") .trim(); // Count scripts const scriptCount = (html.match(/