Files
dotfiles/pi/files/agent/extensions/pi-web-access/pdf-extract.ts
2026-02-19 22:23:48 +00:00

185 lines
4.5 KiB
TypeScript

/**
* PDF Content Extractor
*
* Extracts text from PDF files and saves to markdown.
* Uses unpdf (pdfjs-dist wrapper) for text extraction.
*/
import { getDocumentProxy } from "unpdf";
import { writeFile, mkdir } from "node:fs/promises";
import { join, basename } from "node:path";
import { homedir } from "node:os";
/** Summary returned by `extractPDFToMarkdown` once the markdown file has been written. */
export interface PDFExtractResult {
/** Title taken from the PDF metadata, or derived from the source URL when the metadata has none. */
title: string;
/** Total number of pages in the PDF (not the number of pages actually extracted). */
pages: number;
/** Length of the generated markdown content, as given by the string's `length`. */
chars: number;
/** Path of the written markdown file: the output directory joined with the output filename. */
outputPath: string;
}
/** Optional overrides accepted by `extractPDFToMarkdown`. */
export interface PDFExtractOptions {
/** Upper bound on the number of pages to extract, counted from page 1. Defaults to `DEFAULT_MAX_PAGES`. */
maxPages?: number;
/** Directory the markdown file is written to; created recursively if missing. Defaults to `DEFAULT_OUTPUT_DIR`. */
outputDir?: string;
/** Output filename, used as given. When omitted, a sanitized form of the title plus `.md` is used. */
filename?: string;
}
/** Page limit applied when `PDFExtractOptions.maxPages` is not provided. */
const DEFAULT_MAX_PAGES = 100;
/** Output directory applied when `PDFExtractOptions.outputDir` is not provided: `Downloads` under the user's home directory. */
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");
/**
 * Extract text from a PDF buffer and save it as a markdown file.
 *
 * The generated markdown starts with a header (title, source URL, page count
 * and, when available, author) followed by the text of every non-empty page.
 * Each emitted page after the first one is preceded by a `<!-- Page N -->`
 * marker. When the PDF has more pages than the limit, a truncation note is
 * appended.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @param url - Source URL; written into the header and used as the title
 *   fallback when the PDF metadata carries no usable title.
 * @param options - Optional page limit, output directory and output filename.
 * @returns The title, the total page count of the PDF, the length of the
 *   generated markdown and the path it was written to.
 * @throws Propagates any error raised while parsing the PDF, creating the
 *   output directory or writing the file.
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename
  } = options;

  // Normalize the limit so NaN, negative or fractional values cannot produce
  // nonsensical page counts in the header ("extracted first -5" / "2.5").
  // Infinity is preserved and means "no limit".
  const pageLimit = Number.isNaN(maxPages)
    ? DEFAULT_MAX_PAGES
    : Math.max(0, Math.floor(maxPages));

  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  try {
    const totalPages = pdf.numPages;
    const metadata = await pdf.getMetadata();

    // Metadata values are not guaranteed to be strings, so check at runtime
    // instead of asserting. Whitespace is collapsed because both values are
    // placed on single markdown lines.
    const rawTitle: unknown = metadata.info?.Title;
    const metaTitle =
      typeof rawTitle === "string" ? rawTitle.replace(/\s+/g, " ").trim() : "";
    const title = metaTitle || extractTitleFromURL(url);

    const rawAuthor: unknown = metadata.info?.Author;
    const author =
      typeof rawAuthor === "string" ? rawAuthor.replace(/\s+/g, " ").trim() : "";

    // Determine pages to extract
    const pagesToExtract = Math.min(totalPages, pageLimit);
    const truncated = totalPages > pageLimit;

    // Extract text page by page so page boundaries can be marked in the output
    const pages: { pageNum: number; text: string }[] = [];
    for (let pageNum = 1; pageNum <= pagesToExtract; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const text = textContent.items
        .map((item: unknown) => {
          // Items without a `str` field contribute no text.
          const textItem = item as { str?: string };
          return textItem.str || "";
        })
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      // Pages without any text are skipped entirely.
      if (text) {
        pages.push({ pageNum, text });
      }
    }

    // Header with metadata
    const lines: string[] = [
      `# ${title}`,
      "",
      `> Source: ${url}`,
      `> Pages: ${totalPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`,
    ];
    if (author) {
      lines.push(`> Author: ${author}`);
    }
    lines.push("", "---", "");

    // Content with page markers (no marker before the first emitted page)
    pages.forEach((page, index) => {
      if (index > 0) {
        lines.push("", `<!-- Page ${page.pageNum} -->`, "");
      }
      lines.push(page.text);
    });

    if (truncated) {
      lines.push(
        "",
        "---",
        "",
        `*[Truncated: Only first ${pagesToExtract} of ${totalPages} pages extracted]*`
      );
    }

    const content = lines.join("\n");

    // A caller-supplied filename is used as given; otherwise derive one from the title.
    const outputFilename = filename || sanitizeFilename(title) + ".md";
    const outputPath = join(outputDir, outputFilename);

    // Ensure output directory exists, then write the file
    await mkdir(outputDir, { recursive: true });
    await writeFile(outputPath, content, "utf-8");

    return {
      title,
      pages: totalPages,
      chars: content.length,
      outputPath,
    };
  } finally {
    // Release the parsed document on success and on failure alike.
    await pdf.destroy();
  }
}
/**
 * Derive a human-readable title from a document URL.
 *
 * The title is the last path segment with percent-escapes decoded and a
 * trailing `.pdf` extension (in any letter case) removed. For arXiv URLs of
 * the form `/pdf/<id>` or `/abs/<id>` the name `arxiv-<id>` is used instead.
 * Runs of underscores and hyphens are then replaced by a single space and
 * whitespace is collapsed, so `/pdf/1706.03762` yields `"arxiv 1706.03762"`.
 *
 * @param url - Absolute URL of the document.
 * @returns The derived title, or `"document"` when the URL cannot be parsed
 *   or produces an empty name.
 */
function extractTitleFromURL(url: string): string {
  const pdfExtension = ".pdf";
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;

    // URL pathnames are percent-encoded; decode only the last segment so an
    // encoded "/" cannot change which segment is selected.
    let name = basename(pathname);
    try {
      name = decodeURIComponent(name);
    } catch {
      // Malformed escape sequence: keep the raw segment rather than failing.
    }

    // Strip the extension case-insensitively, but leave a bare ".pdf" alone.
    if (
      name.length > pdfExtension.length &&
      name.toLowerCase().endsWith(pdfExtension)
    ) {
      name = name.slice(0, -pdfExtension.length);
    }

    // arXiv URLs carry the paper id in the path: /pdf/1706.03762 or /abs/1706.03762
    if (urlObj.hostname.includes("arxiv.org")) {
      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
      if (match) {
        name = `arxiv-${match[1]}`;
      }
    }

    // Turn separators into spaces and collapse whitespace
    const cleaned = name
      .replace(/[_-]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();
    return cleaned || "document";
  } catch {
    // `new URL` throws on strings that are not absolute URLs.
    return "document";
  }
}
/**
 * Reduce an arbitrary string to a lowercase, hyphen-separated filename stem.
 *
 * Only `a-z`, `0-9`, whitespace and hyphens survive; whitespace runs become a
 * hyphen, repeated hyphens are collapsed, the result is cut to 100 characters
 * and a single leading/trailing hyphen is removed.
 *
 * @param name - Text to convert, typically a document title.
 * @returns The sanitized stem, or `"document"` when nothing usable remains.
 */
function sanitizeFilename(name: string): string {
  // Drop everything outside the allowed character set.
  const allowedOnly = name.toLowerCase().replace(/[^a-z0-9\s-]/g, "");
  // Whitespace becomes a separator; adjacent separators merge into one.
  const hyphenated = allowedOnly.replace(/\s+/g, "-").replace(/-+/g, "-");
  // Trim edge hyphens after the cut so truncation cannot leave a dangling one.
  const stem = hyphenated.slice(0, 100).replace(/^-|-$/g, "");
  return stem === "" ? "document" : stem;
}
/**
 * Check whether a URL or content-type indicates a PDF.
 *
 * @param url - URL of the resource; only its path is inspected, so query
 *   strings and fragments are ignored.
 * @param contentType - Optional `Content-Type` header value. Matched
 *   case-insensitively because media types are case-insensitive, and by
 *   substring so parameters such as `; charset=...` are tolerated.
 * @returns `true` when the content type contains `application/pdf` or the URL
 *   path ends in `.pdf` (any letter case); `false` otherwise, including when
 *   the URL cannot be parsed.
 */
export function isPDF(url: string, contentType?: string): boolean {
  if (contentType?.toLowerCase().includes("application/pdf")) {
    return true;
  }
  try {
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    // `new URL` throws on strings that are not absolute URLs.
    return false;
  }
}