Files
dotfiles/pi/files/agent/skills/pi-skills/browser-tools/browser-content.js
T
Thomas G. Lopes 7edfb90d5f skills
2026-02-24 16:54:30 +00:00

104 lines
3.1 KiB
JavaScript
Executable File

#!/usr/bin/env node
import puppeteer from "puppeteer-core";
import { Readability } from "@mozilla/readability";
import { JSDOM } from "jsdom";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
// Global timeout - exit if script takes too long
const TIMEOUT = 30000;
const timeoutId = setTimeout(() => {
console.error("✗ Timeout after 30s");
process.exit(1);
}, TIMEOUT).unref();
const url = process.argv[2];
if (!url) {
console.log("Usage: browser-content.js <url>");
console.log("\nExtracts readable content from a URL as markdown.");
console.log("\nExamples:");
console.log(" browser-content.js https://example.com");
console.log(" browser-content.js https://en.wikipedia.org/wiki/Rust_(programming_language)");
process.exit(1);
}
const b = await Promise.race([
puppeteer.connect({
browserURL: "http://localhost:9222",
defaultViewport: null,
}),
new Promise((_, reject) => setTimeout(() => reject(new Error("timeout")), 5000)),
]).catch((e) => {
console.error("✗ Could not connect to browser:", e.message);
console.error(" Run: browser-start.js");
process.exit(1);
});
const p = (await b.pages()).at(-1);
if (!p) {
console.error("✗ No active tab found");
process.exit(1);
}
await Promise.race([
p.goto(url, { waitUntil: "networkidle2" }),
new Promise((r) => setTimeout(r, 10000)),
]).catch(() => {});
// Get HTML via CDP (works even with TrustedScriptURL restrictions)
const client = await p.createCDPSession();
const { root } = await client.send("DOM.getDocument", { depth: -1, pierce: true });
const { outerHTML } = await client.send("DOM.getOuterHTML", { nodeId: root.nodeId });
await client.detach();
const finalUrl = p.url();
// Extract with Readability
const doc = new JSDOM(outerHTML, { url: finalUrl });
const reader = new Readability(doc.window.document);
const article = reader.parse();
// Convert to markdown
function htmlToMarkdown(html) {
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
turndown.use(gfm);
turndown.addRule("removeEmptyLinks", {
filter: (node) => node.nodeName === "A" && !node.textContent?.trim(),
replacement: () => "",
});
return turndown
.turndown(html)
.replace(/\[\\?\[\s*\\?\]\]\([^)]*\)/g, "")
.replace(/ +/g, " ")
.replace(/\s+,/g, ",")
.replace(/\s+\./g, ".")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
let content;
if (article && article.content) {
content = htmlToMarkdown(article.content);
} else {
// Fallback
const fallbackDoc = new JSDOM(outerHTML, { url: finalUrl });
const fallbackBody = fallbackDoc.window.document;
fallbackBody.querySelectorAll("script, style, noscript, nav, header, footer, aside").forEach((el) => el.remove());
const main = fallbackBody.querySelector("main, article, [role='main'], .content, #content") || fallbackBody.body;
const fallbackHtml = main?.innerHTML || "";
if (fallbackHtml.trim().length > 100) {
content = htmlToMarkdown(fallbackHtml);
} else {
content = "(Could not extract content)";
}
}
console.log(`URL: ${finalUrl}`);
if (article?.title) console.log(`Title: ${article.title}`);
console.log("");
console.log(content);
process.exit(0);