// dotfiles/pi/files/agent/extensions/pi-web-access/pdf-extract.ts

/**
 * PDF Content Extractor
 *
 * Extracts text from PDF files and saves to markdown.
 * Uses unpdf (pdfjs-dist wrapper) for text extraction.
 */

import { getDocumentProxy } from "unpdf";
import { writeFile, mkdir } from "node:fs/promises";
import { join, basename } from "node:path";
import { homedir } from "node:os";

/** Summary returned by `extractPDFToMarkdown` after the markdown file is written. */
export interface PDFExtractResult {
  /** Title used for the markdown heading: the PDF metadata title, or one derived from the URL. */
  title: string;
  /** Total number of pages in the PDF (not the number actually extracted). */
  pages: number;
  /** Length of the generated markdown string (`String.length`, i.e. UTF-16 code units). */
  chars: number;
  /** Path of the written markdown file (output directory joined with the file name). */
  outputPath: string;
}

/** Optional overrides for `extractPDFToMarkdown`. */
export interface PDFExtractOptions {
  /** Maximum number of pages to extract, counted from page 1. Defaults to `DEFAULT_MAX_PAGES`. */
  maxPages?: number;
  /** Directory the markdown file is written to (created if missing). Defaults to `DEFAULT_OUTPUT_DIR`. */
  outputDir?: string;
  /** Output file name, used as given. When omitted, a sanitized title plus ".md" is used. */
  filename?: string;
}

// Page limit applied when the caller does not pass `maxPages`.
const DEFAULT_MAX_PAGES = 100;
// Output directory used when the caller does not pass `outputDir`:
// the "Downloads" folder inside the current user's home directory.
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");

/**
 * Extract text from a PDF buffer and save it as a markdown file.
 *
 * The markdown starts with a `# title` heading and a blockquote holding the
 * source URL, the page count and (when present) the author. The text of every
 * non-empty extracted page follows; each page after the first emitted one is
 * preceded by a `<!-- Page N -->` marker. When the page limit cuts the
 * document short, a truncation notice is appended.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @param url - URL the PDF came from; written to the header and used to derive
 *   a title when the PDF metadata has no usable one.
 * @param options - Page limit and output location overrides.
 * @returns The title, the document's total page count, the length of the
 *   written markdown and the path of the written file.
 * @throws Rejects with whatever the PDF loader, `mkdir` or `writeFile` reject with.
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename
  } = options;

  const pdf = await getDocumentProxy(new Uint8Array(buffer));

  try {
    const metadata = await pdf.getMetadata();
    const totalPages = pdf.numPages;

    // Prefer the embedded title. Metadata values are not guaranteed to be
    // strings, so guard before trimming instead of asserting the type.
    const rawTitle: unknown = metadata.info?.Title;
    const metaTitle = typeof rawTitle === "string" ? rawTitle.trim() : "";
    const title = metaTitle || extractTitleFromURL(url);

    // Clamp the limit to a non-negative whole number so the loop bound and the
    // numbers printed in the output always agree (Infinity still means "all").
    const pageLimit = Number.isNaN(maxPages)
      ? DEFAULT_MAX_PAGES
      : Math.max(0, Math.floor(maxPages));
    const pagesToExtract = Math.min(totalPages, pageLimit);
    const truncated = totalPages > pagesToExtract;

    // Extract text page by page so page boundaries can be marked in the output
    const pages: { pageNum: number; text: string }[] = [];
    for (let pageNum = 1; pageNum <= pagesToExtract; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const text = textContent.items
        // Items without a `str` property contribute nothing
        .map((item: unknown) => (item as { str?: string }).str || "")
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();

      // Pages with no text are left out entirely
      if (text) {
        pages.push({ pageNum, text });
      }
    }

    // Header with metadata
    const lines: string[] = [
      `# ${title}`,
      "",
      `> Source: ${url}`,
      `> Pages: ${totalPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`,
    ];
    const author: unknown = metadata.info?.Author;
    if (author) {
      lines.push(`> Author: ${String(author)}`);
    }
    lines.push("", "---", "");

    // Content; every page except the first emitted one gets a page marker
    pages.forEach((page, index) => {
      if (index > 0) {
        lines.push("", `<!-- Page ${page.pageNum} -->`, "");
      }
      lines.push(page.text);
    });

    if (truncated) {
      lines.push(
        "",
        "---",
        "",
        `*[Truncated: Only first ${pagesToExtract} of ${totalPages} pages extracted]*`
      );
    }

    const content = lines.join("\n");

    // An explicit file name wins; otherwise derive one from the title
    const outputFilename = filename || sanitizeFilename(title) + ".md";
    const outputPath = join(outputDir, outputFilename);

    // Ensure the output directory exists before writing
    await mkdir(outputDir, { recursive: true });
    await writeFile(outputPath, content, "utf-8");

    return {
      title,
      pages: totalPages,
      chars: content.length,
      outputPath,
    };
  } finally {
    // Always tear down the loaded document, on success and on failure, so
    // repeated extractions do not accumulate open documents.
    await pdf.destroy();
  }
}

/**
 * Derive a human-readable title from a PDF URL.
 *
 * Uses the last path segment, percent-decoded, without a trailing ".pdf"
 * (any letter case), with runs of underscores/hyphens turned into single
 * spaces. arXiv `/pdf/<id>` and `/abs/<id>` URLs yield "arxiv <id>".
 *
 * @param url - Absolute URL of the PDF.
 * @returns The derived title, or "document" when the URL cannot be parsed or
 *   yields an empty name.
 */
function extractTitleFromURL(url: string): string {
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;

    // Last path segment; URL paths are percent-encoded ("My%20Paper.pdf")
    let filename = basename(pathname);
    try {
      filename = decodeURIComponent(filename);
    } catch {
      // Malformed escape sequence (e.g. a lone "%"): keep the raw segment
    }

    // Drop the extension regardless of case, matching isPDF's ".pdf" check
    filename = filename.replace(/\.pdf$/i, "");

    // Handle arxiv URLs: /pdf/1706.03762 → "arxiv-1706.03762"
    // (the clean-up below turns the hyphen into a space: "arxiv 1706.03762")
    if (urlObj.hostname.includes("arxiv.org")) {
      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
      if (match) {
        filename = `arxiv-${match[1]}`;
      }
    }

    // Separators become spaces; collapse whitespace runs
    filename = filename
      .replace(/[_-]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    return filename || "document";
  } catch {
    // `new URL` throws on strings that are not absolute URLs
    return "document";
  }
}

/**
 * Turn an arbitrary string into a lowercase, hyphen-separated ASCII file name
 * stem of at most 100 characters.
 *
 * @param name - Text to convert, typically a document title.
 * @returns The sanitized stem (no extension), or "document" when nothing
 *   usable remains.
 */
function sanitizeFilename(name: string): string {
  const stem = name
    // Decompose accented letters ("é" → "e" + combining mark) so the ASCII
    // filter below keeps the base letter instead of dropping it entirely
    .normalize("NFKD")
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .slice(0, 100)
    // Trim after slicing: the cut may leave a trailing hyphen
    .replace(/^-|-$/g, "");

  return stem || "document";
}

/**
 * Check whether a response looks like a PDF.
 *
 * @param url - URL of the resource; its path is checked for a ".pdf" suffix.
 * @param contentType - Optional Content-Type header value.
 * @returns `true` when the content type mentions "application/pdf" or the URL
 *   path ends in ".pdf" (both case-insensitive); `false` otherwise, including
 *   when `url` cannot be parsed.
 */
export function isPDF(url: string, contentType?: string): boolean {
  // Media types are case-insensitive, so "Application/PDF" must match too
  if (contentType?.toLowerCase().includes("application/pdf")) {
    return true;
  }

  try {
    // Only the path is inspected, so query strings and fragments are ignored
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}