// Listing metadata: 561 lines, 20 KiB, TypeScript
import { Readability } from "@mozilla/readability";
|
|
import { parseHTML } from "linkedom";
|
|
import TurndownService from "turndown";
|
|
import pLimit from "p-limit";
|
|
import { activityMonitor } from "./activity.js";
|
|
import { extractRSCContent } from "./rsc-extract.js";
|
|
import { extractPDFToMarkdown, isPDF } from "./pdf-extract.js";
|
|
import { extractGitHub } from "./github-extract.js";
|
|
import { isYouTubeURL, isYouTubeEnabled, extractYouTube, extractYouTubeFrame, extractYouTubeFrames, getYouTubeStreamInfo } from "./youtube-extract.js";
|
|
import { extractWithUrlContext, extractWithGeminiWeb } from "./gemini-url-context.js";
|
|
import { isVideoFile, extractVideo, extractVideoFrame, getLocalVideoDuration } from "./video-extract.js";
|
|
import { formatSeconds } from "./utils.js";
|
|
|
|
/** Time budget (ms) for a direct HTTP fetch when `ExtractOptions.timeoutMs` is not supplied. */
const DEFAULT_TIMEOUT_MS = 30_000;

/** Size of the concurrency pool that `fetchAllContent` schedules extractions through. */
const CONCURRENT_LIMIT = 3;

/** HTTP-extraction errors starting with one of these prefixes are returned as-is, without trying fallbacks. */
const NON_RECOVERABLE_ERRORS = ["Unsupported content type", "Response too large"];

/** Converted Markdown shorter than this many characters is returned together with an error. */
const MIN_USEFUL_CONTENT = 500;
|
|
|
|
/** Shared HTML-to-Markdown converter: ATX (`#`) headings and fenced code blocks. */
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });

/** Limiter that caps how many extractions run at once (see CONCURRENT_LIMIT). */
const fetchLimit = pLimit(CONCURRENT_LIMIT);
|
|
|
|
/** One image captured from a video, tagged with the position it was taken at. */
export interface VideoFrame {
  /** Encoded image payload (presumably base64 — confirm against the frame extractors). */
  data: string;
  /** MIME type of `data`. */
  mimeType: string;
  /** Human-readable position of the frame, as produced by `formatSeconds`. */
  timestamp: string;
}
|
|
|
|
/** Image payload of a single extracted frame, without position information. */
export type FrameData = { data: string; mimeType: string };

/** Outcome of a single-frame extraction: the frame itself, or an object carrying an error message. */
export type FrameResult = FrameData | { error: string };
|
|
|
|
/** Result of extracting one URL or local path; failures are reported through `error`, not thrown. */
export interface ExtractedContent {
  /** The URL (or path) exactly as it was requested. */
  url: string;
  /** Title of the extracted item; empty string when none could be determined. */
  title: string;
  /** Extracted text/Markdown; for frame requests, a short description or the error text. */
  content: string;
  /** `null` on success, otherwise a human-readable failure message. */
  error: string | null;
  /** Image returned for single-timestamp frame requests. */
  thumbnail?: { data: string; mimeType: string };
  /** Images returned for multi-frame requests. */
  frames?: VideoFrame[];
  /** Video duration reported alongside extracted frames (presumably seconds — confirm with the extractors). */
  duration?: number;
}
|
|
|
|
/** Optional knobs accepted by `extractContent` and `fetchAllContent`. */
export interface ExtractOptions {
  /** Time budget (ms) for the direct HTTP fetch; falls back to DEFAULT_TIMEOUT_MS. */
  timeoutMs?: number;
  /** Forwarded to the GitHub extractor. */
  forceClone?: boolean;
  /** Forwarded to the YouTube extractor. */
  prompt?: string;
  /** Video position to grab frames from: seconds, `m:ss`, `h:mm:ss`, or a `start-end` range. */
  timestamp?: string;
  /** Number of frames requested; without `timestamp` the frames are spread over the whole video. */
  frames?: number;
  /** Forwarded to the YouTube extractor. */
  model?: string;
}
|
|
|
|
/** Prefix of the Jina Reader endpoint; the target URL is appended verbatim. */
const JINA_READER_BASE = "https://r.jina.ai/";

/** Time budget (ms) for a single Jina Reader request. */
const JINA_TIMEOUT_MS = 30_000;
|
|
|
|
/**
 * Fetches `url` through the Jina Reader endpoint and returns its Markdown.
 *
 * Returns `null` instead of an error result whenever the reader cannot help:
 * non-OK response, missing "Markdown Content:" marker, too little text,
 * a rendering placeholder, or any thrown error (including aborts/timeouts).
 *
 * @param url    Page to extract.
 * @param signal Optional caller signal, combined with the reader's own timeout.
 */
async function extractWithJinaReader(
  url: string,
  signal?: AbortSignal,
): Promise<ExtractedContent | null> {
  const marker = "Markdown Content:";
  const placeholders = ["Loading...", "Please enable JavaScript"];

  const activityId = activityMonitor.logStart({ type: "api", query: `jina: ${url}` });

  try {
    // The request ends on whichever fires first: our timeout or the caller's signal.
    const signals = [AbortSignal.timeout(JINA_TIMEOUT_MS)];
    if (signal) signals.push(signal);

    const res = await fetch(JINA_READER_BASE + url, {
      headers: {
        "Accept": "text/markdown",
        "X-No-Cache": "true",
      },
      signal: AbortSignal.any(signals),
    });

    if (!res.ok) {
      activityMonitor.logComplete(activityId, res.status);
      return null;
    }

    const body = await res.text();
    activityMonitor.logComplete(activityId, res.status);

    const markerAt = body.indexOf(marker);
    if (markerAt < 0) return null;

    const markdown = body.slice(markerAt + marker.length).trim();

    // Reject failed JS rendering or minimal content.
    const tooShort = markdown.length < 100;
    if (tooShort || placeholders.some((text) => markdown.startsWith(text))) {
      return null;
    }

    // Prefer the first heading; otherwise the last path segment, then the URL itself.
    const title = extractHeadingTitle(markdown) ?? (new URL(url).pathname.split("/").pop() || url);
    return { url, title, content: markdown, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    const wasAborted = message.toLowerCase().includes("abort");
    if (wasAborted) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return null;
  }
}
|
|
|
|
/**
 * Parses a video position into whole seconds.
 *
 * Accepted forms: a non-negative number of seconds ("90", "90.7"),
 * "m:ss", or "h:mm:ss". Fractions are floored.
 *
 * @param ts Raw timestamp text.
 * @returns Seconds as an integer, or `null` when `ts` is blank, negative,
 *          non-finite, or not in one of the accepted forms.
 */
function parseTimestamp(ts: string): number | null {
  const text = ts.trim();
  // Number("") is 0, so blank input must be rejected explicitly rather than read as 0 seconds.
  if (text === "") return null;

  const plainSeconds = Number(text);
  // isFinite also rejects "Infinity", which would otherwise be returned as a timestamp.
  if (Number.isFinite(plainSeconds) && plainSeconds >= 0) return Math.floor(plainSeconds);

  const parts = text.split(":").map(Number);
  if (parts.length !== 2 && parts.length !== 3) return null;
  if (parts.some((part) => !Number.isFinite(part) || part < 0)) return null;

  // Base-60 fold: [h, m, s] -> h*3600 + m*60 + s, [m, s] -> m*60 + s.
  return Math.floor(parts.reduce((total, part) => total * 60 + part, 0));
}
|
|
|
|
/** Parsed form of a timestamp option: one position, or a start/end range (all in seconds). */
type TimestampSpec =
  | { type: "single"; seconds: number }
  | { type: "range"; start: number; end: number };
|
|
|
|
/**
 * Interprets a timestamp option as either a range ("start-end") or a single position.
 *
 * A dash is only treated as a range separator from index 1 onward. When the
 * text does not form a valid, strictly increasing range, the whole string is
 * parsed as a single timestamp instead.
 *
 * @returns The parsed spec, or `null` when nothing could be parsed.
 */
function parseTimestampSpec(ts: string): TimestampSpec | null {
  const separatorAt = ts.indexOf("-", 1);

  if (separatorAt !== -1) {
    const start = parseTimestamp(ts.slice(0, separatorAt));
    const end = parseTimestamp(ts.slice(separatorAt + 1));
    if (start !== null && end !== null && end > start) {
      return { type: "range", start, end };
    }
  }

  const seconds = parseTimestamp(ts);
  if (seconds === null) return null;
  return { type: "single", seconds };
}
|
|
|
|
/** Number of frames sampled from a range when the caller does not specify a count. */
const DEFAULT_RANGE_FRAMES = 6;

/** Smallest spacing, in seconds, allowed between two sampled frames. */
const MIN_FRAME_INTERVAL = 5;

/**
 * Chooses the positions (in seconds) at which to sample frames between `start` and `end`.
 *
 * - With a count of one or less, only `start` is sampled.
 * - If spreading the frames evenly would put them closer than MIN_FRAME_INTERVAL,
 *   frames are taken every MIN_FRAME_INTERVAL seconds from `start` until `end`
 *   or the frame count is reached (so fewer frames than requested may result).
 * - Otherwise the frames are spread evenly, including both `start` and `end`.
 *
 * @param maxFrames Upper bound on the number of frames. Fractional values are
 *                  floored so the spacing and the number of samples agree.
 */
function computeRangeTimestamps(start: number, end: number, maxFrames: number = DEFAULT_RANGE_FRAMES): number[] {
  // Work with a whole frame count: Array.from truncates a fractional length,
  // which would otherwise leave the last sample short of `end`.
  const frameCap = Math.floor(maxFrames);
  if (!(frameCap > 1)) return [start];

  const idealInterval = (end - start) / (frameCap - 1);

  if (idealInterval < MIN_FRAME_INTERVAL) {
    const timestamps: number[] = [];
    for (let t = start; t <= end && timestamps.length < frameCap; t += MIN_FRAME_INTERVAL) {
      timestamps.push(t);
    }
    return timestamps;
  }

  return Array.from({ length: frameCap }, (_, i) => Math.round(start + i * idealInterval));
}
|
|
|
|
/**
 * Packages the outcome of a multi-frame extraction as an ExtractedContent.
 *
 * The title always reports "extracted/requested". With no frames at all the
 * result is an error (using `error`, or a generic message when it is null);
 * otherwise `error` is ignored and the frames and duration are attached.
 *
 * @param label          Human-readable range the frames were taken from.
 * @param requestedCount Number of frames that were asked for.
 */
function buildFrameResult(
  url: string, label: string, requestedCount: number,
  frames: VideoFrame[], error: string | null, duration?: number,
): ExtractedContent {
  const extracted = frames.length;
  const title = `Frames ${label} (${extracted}/${requestedCount})`;

  if (extracted > 0) {
    return {
      url,
      title,
      content: `${extracted} frames extracted from ${label}`,
      error: null,
      frames,
      duration,
    };
  }

  const failure = error ?? "Frame extraction failed";
  return { url, title, content: failure, error: failure };
}
|
|
|
|
/**
 * Extracts one frame per timestamp from a local video file, all in parallel.
 *
 * Frames that could not be extracted are dropped. An error is reported only
 * when no frame succeeded at all, in which case it is the first failure's message.
 *
 * @param filePath   Path of the video file, passed through to `extractVideoFrame`.
 * @param timestamps Positions in seconds.
 */
async function extractLocalFrames(
  filePath: string, timestamps: number[],
): Promise<{ frames: VideoFrame[]; error: string | null }> {
  const attempts = timestamps.map(async (seconds) => {
    const outcome = await extractVideoFrame(filePath, seconds);
    return "error" in outcome
      ? { error: outcome.error }
      : { ...outcome, timestamp: formatSeconds(seconds) };
  });
  const settled = await Promise.all(attempts);

  const frames: VideoFrame[] = [];
  let firstError: string | null = null;
  for (const item of settled) {
    if ("data" in item) {
      frames.push(item);
    } else if (firstError === null && "error" in item) {
      firstError = item.error;
    }
  }

  // Partial success is still success: the error only surfaces when nothing was extracted.
  return { frames, error: frames.length === 0 ? firstError : null };
}
|
|
|
|
/** What a timestamp request asks for: one frame, or several frames ending at `end`. */
type FramePlan =
  | { kind: "single"; seconds: number; title: string }
  | { kind: "multi"; end: number; timestamps: number[]; label: string; title: string };

/** Error-shaped frame result: the message is used both as content and as error. */
function frameFailure(url: string, title: string, message: string): ExtractedContent {
  return { url, title, content: message, error: message };
}

/**
 * Turns the `timestamp`/`frames` options into a frame plan.
 * Returns `null` when the timestamp text cannot be parsed.
 */
function planTimestampFrames(timestamp: string, frameCount: number | undefined): FramePlan | null {
  const spec = parseTimestampSpec(timestamp);
  if (!spec) return null;

  if (spec.type === "range") {
    const label = `${formatSeconds(spec.start)}-${formatSeconds(spec.end)}`;
    const timestamps = frameCount
      ? computeRangeTimestamps(spec.start, spec.end, frameCount)
      : computeRangeTimestamps(spec.start, spec.end);
    return { kind: "multi", end: spec.end, timestamps, label, title: `Frames ${label}` };
  }

  if (frameCount) {
    // Single start position plus a count: sample forward at the minimum interval.
    const end = spec.seconds + (frameCount - 1) * MIN_FRAME_INTERVAL;
    const label = `${formatSeconds(spec.seconds)}-${formatSeconds(end)}`;
    const timestamps = computeRangeTimestamps(spec.seconds, end, frameCount);
    return { kind: "multi", end, timestamps, label, title: `Frames ${label}` };
  }

  return { kind: "single", seconds: spec.seconds, title: `Frame at ${timestamp}` };
}

/** Samples `frameCount` frames across the full length of a YouTube video or local video file. */
async function extractFramesAcrossVideo(url: string, frameCount: number): Promise<ExtractedContent> {
  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && ytInfo.videoId) {
    const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
    if ("error" in streamInfo) {
      return frameFailure(url, "Frames", streamInfo.error);
    }
    if (streamInfo.duration === null) {
      return frameFailure(url, "Frames", "Cannot determine video duration. Use a timestamp range instead.");
    }
    const dur = Math.floor(streamInfo.duration);
    const timestamps = computeRangeTimestamps(0, dur, frameCount);
    const result = await extractYouTubeFrames(ytInfo.videoId, timestamps, streamInfo);
    const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
    return buildFrameResult(url, label, timestamps.length, result.frames, result.error, streamInfo.duration);
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    const durationResult = await getLocalVideoDuration(videoInfo.absolutePath);
    if (typeof durationResult !== "number") {
      return frameFailure(url, "Frames", durationResult.error);
    }
    const dur = Math.floor(durationResult);
    const timestamps = computeRangeTimestamps(0, dur, frameCount);
    const result = await extractLocalFrames(videoInfo.absolutePath, timestamps);
    const label = `${formatSeconds(0)}-${formatSeconds(dur)}`;
    return buildFrameResult(url, label, timestamps.length, result.frames, result.error, durationResult);
  }

  return { url, title: "", content: "", error: "Frame extraction only works with YouTube and local video files" };
}

/**
 * Extracts the frame(s) described by `timestamp` from a YouTube video or local video file.
 * Returns `null` when the timestamp is unparseable or `url` is neither kind of video,
 * so the caller can continue with regular content extraction.
 */
async function extractFramesAtTimestamp(
  url: string, timestamp: string, frameCount: number | undefined,
): Promise<ExtractedContent | null> {
  const plan = planTimestampFrames(timestamp, frameCount);
  if (!plan) return null;

  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && ytInfo.videoId) {
    const streamInfo = await getYouTubeStreamInfo(ytInfo.videoId);
    if ("error" in streamInfo) {
      return frameFailure(url, plan.title, streamInfo.error);
    }

    // Only YouTube requests are validated against the known duration.
    const lastSecond = plan.kind === "multi" ? plan.end : plan.seconds;
    if (streamInfo.duration !== null && lastSecond > streamInfo.duration) {
      const error = `Timestamp ${formatSeconds(lastSecond)} exceeds video duration (${formatSeconds(Math.floor(streamInfo.duration))})`;
      return frameFailure(url, plan.title, error);
    }

    if (plan.kind === "multi") {
      const result = await extractYouTubeFrames(ytInfo.videoId, plan.timestamps, streamInfo);
      return buildFrameResult(url, plan.label, plan.timestamps.length, result.frames, result.error, result.duration ?? undefined);
    }

    const frame = await extractYouTubeFrame(ytInfo.videoId, plan.seconds, streamInfo);
    if ("error" in frame) {
      return frameFailure(url, plan.title, frame.error);
    }
    return { url, title: plan.title, content: `Video frame at ${timestamp}`, error: null, thumbnail: frame };
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    if (plan.kind === "multi") {
      const result = await extractLocalFrames(videoInfo.absolutePath, plan.timestamps);
      return buildFrameResult(url, plan.label, plan.timestamps.length, result.frames, result.error);
    }

    const frame = await extractVideoFrame(videoInfo.absolutePath, plan.seconds);
    if ("error" in frame) {
      return frameFailure(url, plan.title, frame.error);
    }
    return { url, title: plan.title, content: `Video frame at ${timestamp}`, error: null, thumbnail: frame };
  }

  return null;
}

/**
 * Extracts readable content (or video frames) from a URL or local video path.
 *
 * Order of attempts:
 *  1. Frame requests (`options.frames` and/or `options.timestamp`) on YouTube or local videos.
 *  2. Local video files (whole-video analysis).
 *  3. GitHub, then YouTube (when enabled).
 *  4. Direct HTTP fetch, then — for recoverable failures — Jina Reader and the Gemini fallbacks.
 *
 * Never throws for extraction failures; they are reported through `error`.
 *
 * @param url     URL or local video path.
 * @param signal  Optional abort signal; an already-aborted signal yields an "Aborted" result.
 * @param options Extraction options (see ExtractOptions).
 */
export async function extractContent(
  url: string,
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent> {
  if (signal?.aborted) {
    return { url, title: "", content: "", error: "Aborted" };
  }

  if (options?.frames && !options.timestamp) {
    return extractFramesAcrossVideo(url, options.frames);
  }

  if (options?.timestamp) {
    const framed = await extractFramesAtTimestamp(url, options.timestamp, options.frames);
    if (framed) return framed;
  }

  const videoInfo = isVideoFile(url);
  if (videoInfo) {
    const result = await extractVideo(videoInfo, signal, options);
    return result ?? { url, title: "", content: "", error: "Video analysis requires Gemini access. Either:\n 1. Sign into gemini.google.com in Chrome (free, uses cookies)\n 2. Set GEMINI_API_KEY in ~/.pi/web-search.json" };
  }

  try {
    new URL(url);
  } catch {
    return { url, title: "", content: "", error: "Invalid URL" };
  }

  try {
    const ghResult = await extractGitHub(url, signal, options?.forceClone);
    if (ghResult) return ghResult;
  } catch {
    // Best-effort: a GitHub extraction failure falls through to the other extractors.
  }

  const ytInfo = isYouTubeURL(url);
  if (ytInfo.isYouTube && isYouTubeEnabled()) {
    try {
      const ytResult = await extractYouTube(url, signal, options?.prompt, options?.model);
      if (ytResult) return ytResult;
    } catch {
      // Best-effort: reported through the generic YouTube error below.
    }
    return {
      url,
      title: "",
      content: "",
      error: "Could not extract YouTube video content. Sign into Google in Chrome for automatic access, or set GEMINI_API_KEY.",
    };
  }

  const httpResult = await extractViaHttp(url, signal, options);
  const httpError = httpResult.error;

  if (!httpError || signal?.aborted) return httpResult;
  if (NON_RECOVERABLE_ERRORS.some((prefix) => httpError.startsWith(prefix))) return httpResult;

  const jinaResult = await extractWithJinaReader(url, signal);
  if (jinaResult) return jinaResult;

  const geminiResult = await extractWithUrlContext(url, signal)
    ?? await extractWithGeminiWeb(url, signal);
  if (geminiResult) return geminiResult;

  // Every fallback failed: keep the HTTP result but explain what the user can do next.
  const guidance = [
    httpError,
    "",
    "Fallback options:",
    " \u2022 Set GEMINI_API_KEY in ~/.pi/web-search.json",
    " \u2022 Sign into gemini.google.com in Chrome",
    " \u2022 Use web_search to find content about this topic",
  ].join("\n");
  return { ...httpResult, error: guidance };
}
|
|
|
|
/**
 * Heuristic for client-side rendered pages: the document has more than three
 * `<script` tags while its `<body>` holds fewer than 500 characters of visible text.
 *
 * @param html Raw HTML of the page.
 * @returns `false` when no `<body>…</body>` pair is found.
 */
function isLikelyJSRendered(html: string): boolean {
  const body = /<body[^>]*>([\s\S]*?)<\/body>/i.exec(html);
  if (body === null) return false;

  // Scripts are counted over the whole document, not just the body.
  const scriptTags = html.match(/<script/gi)?.length ?? 0;
  if (scriptTags <= 3) return false;

  // Drop script/style blocks, then all remaining tags, then collapse whitespace.
  const withoutCode = body[1]
    .replace(/<script[\s\S]*?<\/script>/gi, "")
    .replace(/<style[\s\S]*?<\/style>/gi, "");
  const visibleText = withoutCode.replace(/<[^>]+>/g, "").replace(/\s+/g, " ").trim();

  return visibleText.length < 500;
}
|
|
|
|
/**
 * Fetches `url` directly and converts the response into an ExtractedContent.
 *
 * - PDFs are handed to `extractPDFToMarkdown` (limit 20 MiB by Content-Length).
 * - Binary media types are rejected as "Unsupported content type".
 * - Non-HTML text is returned verbatim.
 * - HTML goes through Readability + Turndown, with an RSC fallback when
 *   Readability finds no article.
 *
 * Failures are returned through `error`; the function does not throw.
 *
 * NOTE: the size limit relies on the Content-Length header; responses without
 * that header are read in full.
 *
 * @param url     Page to fetch.
 * @param signal  Optional caller abort signal.
 * @param options `timeoutMs` bounds the whole request including reading the body.
 */
async function extractViaHttp(
  url: string,
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent> {
  const timeoutMs = options?.timeoutMs ?? DEFAULT_TIMEOUT_MS;
  const activityId = activityMonitor.logStart({ type: "fetch", url });

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);

  const onAbort = () => controller.abort();
  if (signal?.aborted) {
    // An already-aborted signal never fires "abort" again, so propagate it now.
    controller.abort();
  } else {
    signal?.addEventListener("abort", onAbort, { once: true });
  }

  const failure = (error: string): ExtractedContent => ({ url, title: "", content: "", error });

  try {
    const response = await fetch(url, {
      signal: controller.signal,
      headers: {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
      },
    });

    if (!response.ok) {
      activityMonitor.logComplete(activityId, response.status);
      return failure(`HTTP ${response.status}: ${response.statusText}`);
    }

    const contentLengthHeader = response.headers.get("content-length");
    const contentType = response.headers.get("content-type") || "";
    const isPDFContent = isPDF(url, contentType);
    const maxResponseSize = isPDFContent ? 20 * 1024 * 1024 : 5 * 1024 * 1024;
    if (contentLengthHeader) {
      const contentLength = parseInt(contentLengthHeader, 10);
      if (contentLength > maxResponseSize) {
        activityMonitor.logComplete(activityId, response.status);
        return failure(`Response too large (${Math.round(contentLength / 1024 / 1024)}MB)`);
      }
    }

    if (isPDFContent) {
      try {
        const buffer = await response.arrayBuffer();
        const result = await extractPDFToMarkdown(buffer, url);
        activityMonitor.logComplete(activityId, response.status);
        return {
          url,
          title: result.title,
          content: `PDF extracted and saved to: ${result.outputPath}\n\nPages: ${result.pages}\nCharacters: ${result.chars}`,
          error: null,
        };
      } catch (err) {
        const message = err instanceof Error ? err.message : String(err);
        activityMonitor.logError(activityId, message);
        return failure(`PDF extraction failed: ${message}`);
      }
    }

    if (contentType.includes("application/octet-stream") ||
        contentType.includes("image/") ||
        contentType.includes("audio/") ||
        contentType.includes("video/") ||
        contentType.includes("application/zip")) {
      activityMonitor.logComplete(activityId, response.status);
      return failure(`Unsupported content type: ${contentType.split(";")[0]}`);
    }

    const text = await response.text();
    const isHTML = contentType.includes("text/html") || contentType.includes("application/xhtml+xml");

    if (!isHTML) {
      activityMonitor.logComplete(activityId, response.status);
      const title = extractTextTitle(text, url);
      return { url, title, content: text, error: null };
    }

    const { document } = parseHTML(text);
    const reader = new Readability(document as unknown as Document);
    const article = reader.parse();

    if (!article) {
      const rscResult = extractRSCContent(text);
      if (rscResult) {
        activityMonitor.logComplete(activityId, response.status);
        return { url, title: rscResult.title, content: rscResult.content, error: null };
      }

      activityMonitor.logComplete(activityId, response.status);

      // Provide a more specific error message when the page looks client-rendered.
      const errorMsg = isLikelyJSRendered(text)
        ? "Page appears to be JavaScript-rendered (content loads dynamically)"
        : "Could not extract readable content from HTML structure";
      return failure(errorMsg);
    }

    const markdown = turndown.turndown(article.content);
    activityMonitor.logComplete(activityId, response.status);

    if (markdown.length < MIN_USEFUL_CONTENT) {
      // Keep the partial content but flag it so the caller can try fallbacks.
      return {
        url,
        title: article.title || "",
        content: markdown,
        error: isLikelyJSRendered(text)
          ? "Page appears to be JavaScript-rendered (content loads dynamically)"
          : "Extracted content appears incomplete",
      };
    }

    return { url, title: article.title || "", content: markdown, error: null };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    if (message.toLowerCase().includes("abort")) {
      activityMonitor.logComplete(activityId, 0);
    } else {
      activityMonitor.logError(activityId, message);
    }
    return failure(message);
  } finally {
    clearTimeout(timeoutId);
    signal?.removeEventListener("abort", onAbort);
    // Early returns leave the body unread; aborting cancels it so the connection
    // is released. This is a no-op once the body has been fully consumed.
    controller.abort();
  }
}
|
|
|
|
/**
 * Returns the text of the first level-1 or level-2 Markdown heading (`#` / `##`) in `text`.
 *
 * Asterisks (bold/italic markers) are removed and the result is trimmed.
 *
 * @returns The heading text, or `null` when there is no such heading or it is
 *          empty after cleaning.
 */
export function extractHeadingTitle(text: string): string | null {
  // The gap after the # marks must stay on the heading's own line: `\s+` would
  // also match newlines and capture the following line for an empty heading.
  const match = /^#{1,2}[^\S\r\n]+(.+)/m.exec(text);
  if (!match) return null;

  const cleaned = match[1].replace(/\*+/g, "").trim();
  return cleaned || null;
}
|
|
|
|
/**
 * Picks a title for plain-text content: the first `#`/`##` heading if there is
 * one, otherwise the last path segment of `url`, otherwise `url` itself.
 */
function extractTextTitle(text: string, url: string): string {
  const heading = extractHeadingTitle(text);
  if (heading !== null) return heading;

  const lastSegment = new URL(url).pathname.split("/").pop();
  return lastSegment || url;
}
|
|
|
|
/**
 * Extracts every URL in `urls`, running the extractions through the shared
 * `fetchLimit` limiter.
 *
 * @returns One ExtractedContent per input URL, in input order.
 */
export async function fetchAllContent(
  urls: string[],
  signal?: AbortSignal,
  options?: ExtractOptions,
): Promise<ExtractedContent[]> {
  const pending = urls.map((url) => fetchLimit(() => extractContent(url, signal, options)));
  return Promise.all(pending);
}
|