skills
This commit is contained in:
@@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
import puppeteer from "puppeteer-core";
|
||||
import { Readability } from "@mozilla/readability";
|
||||
import { JSDOM } from "jsdom";
|
||||
import TurndownService from "turndown";
|
||||
import { gfm } from "turndown-plugin-gfm";
|
||||
|
||||
// Global timeout - exit if script takes too long
|
||||
const TIMEOUT = 30000;
|
||||
const timeoutId = setTimeout(() => {
|
||||
console.error("✗ Timeout after 30s");
|
||||
process.exit(1);
|
||||
}, TIMEOUT).unref();
|
||||
|
||||
const url = process.argv[2];
|
||||
|
||||
if (!url) {
|
||||
console.log("Usage: browser-content.js <url>");
|
||||
console.log("\nExtracts readable content from a URL as markdown.");
|
||||
console.log("\nExamples:");
|
||||
console.log(" browser-content.js https://example.com");
|
||||
console.log(" browser-content.js https://en.wikipedia.org/wiki/Rust_(programming_language)");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const b = await Promise.race([
|
||||
puppeteer.connect({
|
||||
browserURL: "http://localhost:9222",
|
||||
defaultViewport: null,
|
||||
}),
|
||||
new Promise((_, reject) => setTimeout(() => reject(new Error("timeout")), 5000)),
|
||||
]).catch((e) => {
|
||||
console.error("✗ Could not connect to browser:", e.message);
|
||||
console.error(" Run: browser-start.js");
|
||||
process.exit(1);
|
||||
});
|
||||
|
||||
const p = (await b.pages()).at(-1);
|
||||
if (!p) {
|
||||
console.error("✗ No active tab found");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
await Promise.race([
|
||||
p.goto(url, { waitUntil: "networkidle2" }),
|
||||
new Promise((r) => setTimeout(r, 10000)),
|
||||
]).catch(() => {});
|
||||
|
||||
// Get HTML via CDP (works even with TrustedScriptURL restrictions)
|
||||
const client = await p.createCDPSession();
|
||||
const { root } = await client.send("DOM.getDocument", { depth: -1, pierce: true });
|
||||
const { outerHTML } = await client.send("DOM.getOuterHTML", { nodeId: root.nodeId });
|
||||
await client.detach();
|
||||
|
||||
const finalUrl = p.url();
|
||||
|
||||
// Extract with Readability
|
||||
const doc = new JSDOM(outerHTML, { url: finalUrl });
|
||||
const reader = new Readability(doc.window.document);
|
||||
const article = reader.parse();
|
||||
|
||||
// Convert to markdown
|
||||
function htmlToMarkdown(html) {
|
||||
const turndown = new TurndownService({ headingStyle: "atx", codeBlockStyle: "fenced" });
|
||||
turndown.use(gfm);
|
||||
turndown.addRule("removeEmptyLinks", {
|
||||
filter: (node) => node.nodeName === "A" && !node.textContent?.trim(),
|
||||
replacement: () => "",
|
||||
});
|
||||
return turndown
|
||||
.turndown(html)
|
||||
.replace(/\[\\?\[\s*\\?\]\]\([^)]*\)/g, "")
|
||||
.replace(/ +/g, " ")
|
||||
.replace(/\s+,/g, ",")
|
||||
.replace(/\s+\./g, ".")
|
||||
.replace(/\n{3,}/g, "\n\n")
|
||||
.trim();
|
||||
}
|
||||
|
||||
let content;
|
||||
if (article && article.content) {
|
||||
content = htmlToMarkdown(article.content);
|
||||
} else {
|
||||
// Fallback
|
||||
const fallbackDoc = new JSDOM(outerHTML, { url: finalUrl });
|
||||
const fallbackBody = fallbackDoc.window.document;
|
||||
fallbackBody.querySelectorAll("script, style, noscript, nav, header, footer, aside").forEach((el) => el.remove());
|
||||
const main = fallbackBody.querySelector("main, article, [role='main'], .content, #content") || fallbackBody.body;
|
||||
const fallbackHtml = main?.innerHTML || "";
|
||||
if (fallbackHtml.trim().length > 100) {
|
||||
content = htmlToMarkdown(fallbackHtml);
|
||||
} else {
|
||||
content = "(Could not extract content)";
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`URL: ${finalUrl}`);
|
||||
if (article?.title) console.log(`Title: ${article.title}`);
|
||||
console.log("");
|
||||
console.log(content);
|
||||
|
||||
process.exit(0);
|
||||
Reference in New Issue
Block a user