Re-add pi-web-access
pi/files/agent/extensions/pi-web-access/pdf-extract.ts (new file, 184 lines added)
@@ -0,0 +1,184 @@
/**
 * PDF Content Extractor
 *
 * Extracts text from PDF files and saves to markdown.
 * Uses unpdf (pdfjs-dist wrapper) for text extraction.
 */

import { getDocumentProxy } from "unpdf";
import { writeFile, mkdir } from "node:fs/promises";
import { join, basename } from "node:path";
import { homedir } from "node:os";

export interface PDFExtractResult {
  title: string;
  pages: number;
  chars: number;
  outputPath: string;
}

export interface PDFExtractOptions {
  maxPages?: number;
  outputDir?: string;
  filename?: string;
}

const DEFAULT_MAX_PAGES = 100;
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");

/**
 * Extract text from a PDF buffer and save to markdown file
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename
  } = options;

  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  const metadata = await pdf.getMetadata();

  // Extract title from metadata or URL
  const metaTitle = metadata.info?.Title as string | undefined;
  const urlTitle = extractTitleFromURL(url);
  const title = metaTitle?.trim() || urlTitle;

  // Determine pages to extract
  const pagesToExtract = Math.min(pdf.numPages, maxPages);
  const truncated = pdf.numPages > maxPages;

  // Extract text page by page for better structure
  const pages: { pageNum: number; text: string }[] = [];
  for (let i = 1; i <= pagesToExtract; i++) {
    const page = await pdf.getPage(i);
    const textContent = await page.getTextContent();
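    // getTextContent() items may include marked-content entries without a "str"
    // field, so read it defensively below.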
    const pageText = textContent.items
      .map((item: unknown) => {
        const textItem = item as { str?: string };
        return textItem.str || "";
      })
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();

    if (pageText) {
      pages.push({ pageNum: i, text: pageText });
    }
  }

  // Build markdown content
  const lines: string[] = [];

  // Header with metadata
  lines.push(`# ${title}`);
  lines.push("");
  lines.push(`> Source: ${url}`);
  lines.push(`> Pages: ${pdf.numPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`);
  if (metadata.info?.Author) {
    lines.push(`> Author: ${metadata.info.Author}`);
  }
  lines.push("");
  lines.push("---");
  lines.push("");

  // Content with page markers
  for (let i = 0; i < pages.length; i++) {
    if (i > 0) {
      lines.push("");
      lines.push(`<!-- Page ${pages[i].pageNum} -->`);
      lines.push("");
    }
    lines.push(pages[i].text);
  }

  if (truncated) {
    lines.push("");
    lines.push("---");
    lines.push("");
    lines.push(`*[Truncated: Only first ${pagesToExtract} of ${pdf.numPages} pages extracted]*`);
  }

  const content = lines.join("\n");

  // Generate output filename
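  // (a caller-supplied `filename` is used as-is; only the title-derived default is sanitized)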
  const outputFilename = filename || sanitizeFilename(title) + ".md";
  const outputPath = join(outputDir, outputFilename);

  // Ensure output directory exists
  await mkdir(outputDir, { recursive: true });

  // Write file
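  // (overwrites any existing file at outputPath)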
  await writeFile(outputPath, content, "utf-8");

  return {
    title,
    pages: pdf.numPages,
    chars: content.length,
    outputPath,
  };
}

/**
 * Extract a reasonable title from URL
 */
function extractTitleFromURL(url: string): string {
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;

    // Get filename without extension
    let filename = basename(pathname, ".pdf");

    // Handle arxiv URLs: /pdf/1706.03762 → "arxiv-1706.03762"
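    // (matches new-style IDs only; old-style IDs like hep-th/9901001 fall through to the generic cleanup)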
    if (urlObj.hostname.includes("arxiv.org")) {
      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
      if (match) {
        filename = `arxiv-${match[1]}`;
      }
    }

    // Clean up filename
    filename = filename
      .replace(/[_-]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    return filename || "document";
  } catch {
    return "document";
  }
}

/**
 * Sanitize string for use as filename
 */
function sanitizeFilename(name: string): string {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .slice(0, 100)
    .replace(/^-|-$/g, "")
    || "document";
}

/**
 * Check if URL or content-type indicates a PDF
 */
export function isPDF(url: string, contentType?: string): boolean {
  if (contentType?.includes("application/pdf")) {
    return true;
  }
  try {
    const urlObj = new URL(url);
    return urlObj.pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}
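
A minimal usage sketch of the module above, assuming a fetch-based caller; the helper name, the relative import path, and the example URL are illustrative and not part of this commit.

// Hypothetical caller: fetch a URL, confirm it is a PDF, then extract it.
import { extractPDFToMarkdown, isPDF } from "./pdf-extract";

async function fetchAndExtract(url: string): Promise<void> {
  const response = await fetch(url);
  const contentType = response.headers.get("content-type") ?? undefined;

  if (!isPDF(url, contentType)) {
    throw new Error(`Not a PDF: ${url}`);
  }

  const buffer = await response.arrayBuffer();
  const result = await extractPDFToMarkdown(buffer, url, { maxPages: 50 });
  console.log(`Extracted "${result.title}" (${result.pages} pages) to ${result.outputPath}`);
}

// e.g. await fetchAndExtract("https://arxiv.org/pdf/1706.03762");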