104 lines
3.1 KiB
JavaScript
Executable File
104 lines
3.1 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
import puppeteer from "puppeteer-core";
|
|
import { Readability } from "@mozilla/readability";
|
|
import { JSDOM } from "jsdom";
|
|
import TurndownService from "turndown";
|
|
import { gfm } from "turndown-plugin-gfm";
|
|
|
|
// Global timeout - exit with status 1 if the whole script takes too long.
// The timer is unref()'d so that it never keeps the process alive on its own.
const TIMEOUT = 30000;
const timeoutId = setTimeout(() => {
  // Derive the message from TIMEOUT so the two cannot drift apart.
  console.error(`✗ Timeout after ${TIMEOUT / 1000}s`);
  process.exit(1);
}, TIMEOUT).unref();
|
|
|
|
// Target URL: the first command-line argument after the script name.
const url = process.argv[2];

// Without a URL there is nothing to do: print the usage text and exit with status 1.
if (!url) {
  const usage = [
    "Usage: browser-content.js <url>",
    "\nExtracts readable content from a URL as markdown.",
    "\nExamples:",
    " browser-content.js https://example.com",
    " browser-content.js https://en.wikipedia.org/wiki/Rust_(programming_language)",
  ];
  console.log(usage.join("\n"));
  process.exit(1);
}
|
|
|
|
// Connect to the browser listening on the local debugging endpoint, giving up
// after 5 seconds. On any failure, print a hint and exit with status 1.
let b;
let connectTimer;
try {
  b = await Promise.race([
    puppeteer.connect({
      browserURL: "http://localhost:9222",
      defaultViewport: null,
    }),
    new Promise((_, reject) => {
      connectTimer = setTimeout(() => reject(new Error("timeout")), 5000);
    }),
  ]);
} catch (e) {
  console.error("✗ Could not connect to browser:", e.message);
  console.error(" Run: browser-start.js");
  process.exit(1);
} finally {
  // Do not leave the connect-timeout timer pending once the race has settled.
  clearTimeout(connectTimer);
}
|
|
|
|
// Use the last page in the list returned by the browser as the working tab.
const p = (await b.pages()).at(-1);
if (!p) {
  console.error("✗ No active tab found");
  process.exit(1);
}

// Navigate, but wait at most 10 seconds for the network to go idle. This is
// best-effort: whatever has loaded by then is extracted. A navigation error
// (for example an invalid URL) is reported on stderr instead of being
// silently dropped, because in that case the tab may still show an earlier page.
await Promise.race([
  p.goto(url, { waitUntil: "networkidle2" }),
  new Promise((resolve) => setTimeout(resolve, 10000)),
]).catch((e) => {
  console.error("⚠ Navigation error (continuing with current tab content):", e?.message ?? e);
});
|
|
|
|
// Get HTML via CDP (works even with TrustedScriptURL restrictions)
const client = await p.createCDPSession();
let outerHTML;
try {
  const { root } = await client.send("DOM.getDocument", { depth: -1, pierce: true });
  ({ outerHTML } = await client.send("DOM.getOuterHTML", { nodeId: root.nodeId }));
} finally {
  // Always release the CDP session, even when one of the commands above fails.
  await client.detach();
}

// URL the tab reports after navigation; used as the base URL for parsing
// and printed in the output.
const finalUrl = p.url();

// Extract with Readability
const doc = new JSDOM(outerHTML, { url: finalUrl });
const reader = new Readability(doc.window.document);
const article = reader.parse();
|
|
|
|
// Convert to markdown

/**
 * Converts an HTML fragment to GitHub-flavoured markdown and tidies the result.
 *
 * Outside fenced code blocks the output is cleaned up: empty bracket links
 * are dropped, runs of spaces are collapsed, whitespace before "," and "."
 * is removed, and three or more consecutive newlines become one blank line.
 * Fenced code blocks are copied through verbatim so that their indentation
 * and line structure survive (the cleanup would otherwise collapse indents
 * and glue lines starting with "." or "," onto the previous line).
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} Trimmed markdown text.
 */
function htmlToMarkdown(html) {
  const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
  turndown.use(gfm);
  // Links without any visible text would otherwise render as empty "[](...)".
  turndown.addRule("removeEmptyLinks", {
    filter: (node) => node.nodeName === "A" && !node.textContent?.trim(),
    replacement: () => "",
  });

  // Whitespace/punctuation cleanup that is only safe for prose.
  const tidyProse = (text) =>
    text
      .replace(/\[\\?\[\s*\\?\]\]\([^)]*\)/g, "")
      .replace(/ +/g, " ")
      .replace(/\s+,/g, ",")
      .replace(/\s+\./g, ".")
      .replace(/\n{3,}/g, "\n\n");

  // A whole fenced block: an opening fence line (3+ backticks or tildes),
  // then everything up to a line consisting of that same fence.
  const fencedCodeBlock = /^(`{3,}|~{3,})[^\n]*\n[\s\S]*?^\1[ \t]*$/gm;

  const markdown = turndown.turndown(html);
  let result = "";
  let cursor = 0;
  for (const match of markdown.matchAll(fencedCodeBlock)) {
    result += tidyProse(markdown.slice(cursor, match.index)) + match[0];
    cursor = match.index + match[0].length;
  }
  result += tidyProse(markdown.slice(cursor));
  return result.trim();
}
|
|
|
|
/**
 * Fallback extraction used when Readability yields no article content.
 *
 * Parses the HTML into a fresh document, removes script/style/noscript and
 * page-chrome elements (nav, header, footer, aside), then converts the first
 * main-content container (or the body if none matches) to markdown.
 *
 * @param {string} html - Full page HTML.
 * @param {string} pageUrl - URL handed to JSDOM as the document URL.
 * @returns {string} Markdown, or a placeholder when 100 characters or fewer
 *   of HTML remain after stripping.
 */
function extractFallbackMarkdown(html, pageUrl) {
  // A new JSDOM instance is built here rather than reusing the one given to
  // Readability - presumably because Readability.parse() modifies the
  // document it is handed (TODO confirm against the Readability docs).
  const { document } = new JSDOM(html, { url: pageUrl }).window;
  for (const el of document.querySelectorAll("script, style, noscript, nav, header, footer, aside")) {
    el.remove();
  }
  const container =
    document.querySelector("main, article, [role='main'], .content, #content") || document.body;
  const fallbackHtml = container?.innerHTML || "";
  if (fallbackHtml.trim().length > 100) {
    return htmlToMarkdown(fallbackHtml);
  }
  return "(Could not extract content)";
}

// Prefer Readability's extracted article; otherwise fall back to the
// stripped-down page body.
const content = article?.content
  ? htmlToMarkdown(article.content)
  : extractFallbackMarkdown(outerHTML, finalUrl);
|
|
|
|
// Print the result: URL line, optional Title line, a blank line, then the content.
const outputLines = [`URL: ${finalUrl}`];
if (article?.title) outputLines.push(`Title: ${article.title}`);
outputLines.push("", content);

// Exit explicitly (the script does not otherwise close the browser
// connection), but only once stdout has accepted the data: calling
// process.exit() right after writing can cut off output that is still pending.
process.stdout.write(`${outputLines.join("\n")}\n`, () => process.exit(0));
|