Re-add pi-web-access
pi/files/agent/extensions/pi-web-access/pdf-extract.ts (new file, 184 lines added)
@@ -0,0 +1,184 @@
/**
 * PDF Content Extractor
 *
 * Extracts text from PDF files and saves to markdown.
 * Uses unpdf (pdfjs-dist wrapper) for text extraction.
 */

import { getDocumentProxy } from "unpdf";
import { writeFile, mkdir } from "node:fs/promises";
import { join, basename } from "node:path";
import { homedir } from "node:os";

export interface PDFExtractResult {
  title: string;
  pages: number;
  chars: number;
  outputPath: string;
}

export interface PDFExtractOptions {
  maxPages?: number;
  outputDir?: string;
  filename?: string;
}

const DEFAULT_MAX_PAGES = 100;
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");

/**
 * Extract text from a PDF buffer and save to markdown file
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename
  } = options;

  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  const metadata = await pdf.getMetadata();

  // Extract title from metadata or URL
  const metaTitle = metadata.info?.Title as string | undefined;
  const urlTitle = extractTitleFromURL(url);
  const title = metaTitle?.trim() || urlTitle;

  // Determine pages to extract
  const pagesToExtract = Math.min(pdf.numPages, maxPages);
  const truncated = pdf.numPages > maxPages;

  // Extract text page by page for better structure
  const pages: { pageNum: number; text: string }[] = [];
  for (let i = 1; i <= pagesToExtract; i++) {
    const page = await pdf.getPage(i);
    const textContent = await page.getTextContent();
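    // getTextContent() items may include marked-content entries without a "str"
    // field, so read it defensively below.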
    const pageText = textContent.items
      .map((item: unknown) => {
        const textItem = item as { str?: string };
        return textItem.str || "";
      })
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();

    if (pageText) {
      pages.push({ pageNum: i, text: pageText });
    }
  }

  // Build markdown content
  const lines: string[] = [];

  // Header with metadata
  lines.push(`# ${title}`);
  lines.push("");
  lines.push(`> Source: ${url}`);
  lines.push(`> Pages: ${pdf.numPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`);
  if (metadata.info?.Author) {
    lines.push(`> Author: ${metadata.info.Author}`);
  }
  lines.push("");
  lines.push("---");
  lines.push("");

  // Content with page markers
  for (let i = 0; i < pages.length; i++) {
    if (i > 0) {
      lines.push("");
      lines.push(`<!-- Page ${pages[i].pageNum} -->`);
      lines.push("");
    }
    lines.push(pages[i].text);
  }

  if (truncated) {
    lines.push("");
    lines.push("---");
    lines.push("");
    lines.push(`*[Truncated: Only first ${pagesToExtract} of ${pdf.numPages} pages extracted]*`);
  }

  const content = lines.join("\n");

  // Generate output filename
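  // (a caller-supplied `filename` is used as-is; only the title-derived default is sanitized)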
  const outputFilename = filename || sanitizeFilename(title) + ".md";
  const outputPath = join(outputDir, outputFilename);

  // Ensure output directory exists
  await mkdir(outputDir, { recursive: true });

  // Write file
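  // (overwrites any existing file at outputPath)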
  await writeFile(outputPath, content, "utf-8");

  return {
    title,
    pages: pdf.numPages,
    chars: content.length,
    outputPath,
  };
}

/**
 * Extract a reasonable title from URL
 */
function extractTitleFromURL(url: string): string {
  try {
    const urlObj = new URL(url);
    const pathname = urlObj.pathname;

    // Get filename without extension
    let filename = basename(pathname, ".pdf");

    // Handle arxiv URLs: /pdf/1706.03762 → "arxiv-1706.03762"
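    // (matches new-style IDs only; old-style IDs like hep-th/9901001 fall through to the generic cleanup)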
    if (urlObj.hostname.includes("arxiv.org")) {
      const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
      if (match) {
        filename = `arxiv-${match[1]}`;
      }
    }

    // Clean up filename
    filename = filename
      .replace(/[_-]+/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    return filename || "document";
  } catch {
    return "document";
  }
}

/**
 * Sanitize string for use as filename
 */
function sanitizeFilename(name: string): string {
  return name
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .slice(0, 100)
    .replace(/^-|-$/g, "")
    || "document";
}

/**
 * Check if URL or content-type indicates a PDF
 */
export function isPDF(url: string, contentType?: string): boolean {
  if (contentType?.includes("application/pdf")) {
    return true;
  }
  try {
    const urlObj = new URL(url);
    return urlObj.pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}
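
A minimal usage sketch of the module above, assuming a fetch-based caller; the helper name, the relative import path, and the example URL are illustrative and not part of this commit.

// Hypothetical caller: fetch a URL, confirm it is a PDF, then extract it.
import { extractPDFToMarkdown, isPDF } from "./pdf-extract";

async function fetchAndExtract(url: string): Promise<void> {
  const response = await fetch(url);
  const contentType = response.headers.get("content-type") ?? undefined;

  if (!isPDF(url, contentType)) {
    throw new Error(`Not a PDF: ${url}`);
  }

  const buffer = await response.arrayBuffer();
  const result = await extractPDFToMarkdown(buffer, url, { maxPages: 50 });
  console.log(`Extracted "${result.title}" (${result.pages} pages) to ${result.outputPath}`);
}

// e.g. await fetchAndExtract("https://arxiv.org/pdf/1706.03762");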