/**
 * PDF Content Extractor
 *
 * Extracts text from PDF files and saves it to markdown.
 * Uses unpdf (a pdfjs-dist wrapper) for text extraction.
 */
|
|
|
|
import { getDocumentProxy } from "unpdf";
|
|
import { writeFile, mkdir } from "node:fs/promises";
|
|
import { join, basename } from "node:path";
|
|
import { homedir } from "node:os";
|
|
|
|
/**
 * Summary of a completed PDF-to-markdown extraction.
 */
export interface PDFExtractResult {
  /** Title used for the markdown heading (PDF metadata title, or a title derived from the URL). */
  title: string;
  /** Total number of pages in the PDF, which may exceed the number of pages actually extracted. */
  pages: number;
  /** Length, in UTF-16 code units, of the markdown content that was written. */
  chars: number;
  /** Path of the markdown file that was written. */
  outputPath: string;
}
|
|
|
|
/**
 * Optional overrides for a PDF-to-markdown extraction.
 */
export interface PDFExtractOptions {
  /** Maximum number of pages to extract. Defaults to `DEFAULT_MAX_PAGES`. */
  maxPages?: number;
  /** Directory the markdown file is written to. Defaults to `DEFAULT_OUTPUT_DIR`. */
  outputDir?: string;
  /** Output file name. Defaults to a sanitized form of the title plus `.md`. */
  filename?: string;
}
|
|
|
|
/** Page limit applied when the caller does not pass `maxPages`. */
const DEFAULT_MAX_PAGES = 100;

/** Output directory used when the caller does not pass `outputDir`: `Downloads` under the user's home directory. */
const DEFAULT_OUTPUT_DIR = join(homedir(), "Downloads");
|
|
|
|
/**
 * Extract the text of a PDF and save it as a markdown file.
 *
 * The markdown starts with a header (title, source URL, page count and, when
 * present, author) followed by the text of every extracted page that contains
 * text. Each page after the first emitted one is preceded by an HTML comment
 * marker (`<!-- Page N -->`). When the PDF has more pages than the limit, a
 * truncation notice is appended.
 *
 * @param buffer - Raw bytes of the PDF document.
 * @param url - URL the PDF came from; written to the header and used to derive
 *   a fallback title when the PDF metadata has none.
 * @param options - Page limit and output location overrides. `filename` is
 *   joined to `outputDir` as given; only the title-derived default is sanitized.
 * @returns The title, the total page count of the PDF (not the number of pages
 *   extracted), the length of the written markdown, and the output path.
 * @throws Rejects when the PDF cannot be loaded/read or the file cannot be written.
 */
export async function extractPDFToMarkdown(
  buffer: ArrayBuffer,
  url: string,
  options: PDFExtractOptions = {}
): Promise<PDFExtractResult> {
  const {
    maxPages = DEFAULT_MAX_PAGES,
    outputDir = DEFAULT_OUTPUT_DIR,
    filename,
  } = options;

  // Normalize the limit so the loop bound and the reported count agree:
  // NaN falls back to the default, fractions are floored, negatives become 0.
  // Infinity is preserved and means "no limit".
  const pageLimit = Number.isNaN(maxPages)
    ? DEFAULT_MAX_PAGES
    : Math.max(0, Math.floor(maxPages));

  const { totalPages, pagesToExtract, info, pages } = await readPDFContent(
    buffer,
    pageLimit
  );
  const truncated = totalPages > pagesToExtract;

  // Metadata values are only trusted when they are strings; whitespace is
  // collapsed so a multi-line value cannot break the single-line header fields.
  const metaString = (value: unknown): string =>
    typeof value === "string" ? value.replace(/\s+/g, " ").trim() : "";

  const title = metaString(info?.["Title"]) || extractTitleFromURL(url);
  const author = metaString(info?.["Author"]);

  // Header with metadata
  const lines: string[] = [
    `# ${title}`,
    "",
    `> Source: ${url}`,
    `> Pages: ${totalPages}${truncated ? ` (extracted first ${pagesToExtract})` : ""}`,
  ];
  if (author) {
    lines.push(`> Author: ${author}`);
  }
  lines.push("", "---", "");

  // Content; every page after the first emitted one gets a page marker
  pages.forEach((page, index) => {
    if (index > 0) {
      lines.push("", `<!-- Page ${page.pageNum} -->`, "");
    }
    lines.push(page.text);
  });

  if (truncated) {
    lines.push(
      "",
      "---",
      "",
      `*[Truncated: Only first ${pagesToExtract} of ${totalPages} pages extracted]*`
    );
  }

  const content = lines.join("\n");

  const outputFilename = filename || sanitizeFilename(title) + ".md";
  const outputPath = join(outputDir, outputFilename);

  // Ensure the output directory exists before writing
  await mkdir(outputDir, { recursive: true });
  await writeFile(outputPath, content, "utf-8");

  return {
    title,
    pages: totalPages,
    chars: content.length,
    outputPath,
  };
}

/**
 * Load a PDF, read its metadata and the text of its first `pageLimit` pages,
 * and always destroy the document proxy afterwards (also on failure).
 */
async function readPDFContent(buffer: ArrayBuffer, pageLimit: number) {
  const pdf = await getDocumentProxy(new Uint8Array(buffer));
  try {
    const totalPages = pdf.numPages;
    const metadata = await pdf.getMetadata();
    const info = metadata.info as Record<string, unknown> | undefined;

    const pagesToExtract = Math.min(totalPages, pageLimit);
    const pages: { pageNum: number; text: string }[] = [];

    // Pages are read one at a time, in order, so output order matches the PDF
    for (let pageNum = 1; pageNum <= pagesToExtract; pageNum++) {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const text = textContent.items
        .map((item: unknown) => {
          // Items without a string `str` contribute no text
          const str = (item as { str?: unknown }).str;
          return typeof str === "string" ? str : "";
        })
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();

      // Pages without any text are skipped entirely
      if (text) {
        pages.push({ pageNum, text });
      }
    }

    return { totalPages, pagesToExtract, info, pages };
  } finally {
    await pdf.destroy();
  }
}
|
|
|
|
/**
 * Derive a human-readable title from a PDF URL.
 *
 * Uses the last path segment, percent-decoded and with a trailing `.pdf`
 * (any letter case) removed. For arXiv URLs (`arxiv.org` and its subdomains)
 * with a `/pdf/<id>` or `/abs/<id>` path, the numeric identifier is used
 * instead. Runs of `_` and `-` become single spaces.
 *
 * @param url - Absolute URL of the PDF.
 * @returns The derived title, or `"document"` when the URL cannot be parsed
 *   or yields an empty name.
 */
function extractTitleFromURL(url: string): string {
  let urlObj: URL;
  try {
    urlObj = new URL(url);
  } catch {
    return "document";
  }
  const { hostname, pathname } = urlObj;

  // Decode after taking the basename so an encoded "/" cannot change which
  // segment is picked; keep the raw text if the escape sequence is malformed.
  const rawName = basename(pathname);
  let name: string;
  try {
    name = decodeURIComponent(rawName);
  } catch {
    name = rawName;
  }
  name = name.replace(/\.pdf$/i, "");

  // arXiv URLs: /pdf/1706.03762 → "arxiv-1706.03762", which the cleanup below
  // turns into "arxiv 1706.03762". Match the host exactly (or as a subdomain)
  // so look-alike hosts such as "arxiv.org.example.com" are not treated as arXiv.
  if (hostname === "arxiv.org" || hostname.endsWith(".arxiv.org")) {
    const match = pathname.match(/\/(?:pdf|abs)\/(\d+\.\d+)/);
    if (match) {
      name = `arxiv-${match[1]}`;
    }
  }

  const cleaned = name
    .replace(/[_-]+/g, " ")
    .replace(/\s+/g, " ")
    .trim();

  return cleaned || "document";
}
|
|
|
|
/**
 * Turn an arbitrary string into a lowercase, hyphen-separated ASCII file name stem.
 *
 * Accented and compatibility characters are folded to their base form first
 * (for example "é" → "e") instead of being dropped. Every remaining character
 * outside `a-z`, `0-9`, whitespace and `-` is removed, whitespace runs become
 * a single hyphen, and the result is limited to 100 characters with no
 * leading or trailing hyphen.
 *
 * @param name - Text to convert, typically a document title.
 * @returns The sanitized stem, or `"document"` when nothing usable remains.
 */
function sanitizeFilename(name: string): string {
  const slug = name
    // Decompose so diacritics become separate combining marks, then drop the marks
    .normalize("NFKD")
    .replace(/[\u0300-\u036f]/g, "")
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    // Truncate before trimming so a hyphen exposed by the cut is removed too
    .slice(0, 100)
    .replace(/^-|-$/g, "");

  return slug || "document";
}
|
|
|
|
/**
 * Decide whether a response is a PDF from its content type or its URL.
 *
 * @param url - URL of the resource; only its path is inspected.
 * @param contentType - Optional `Content-Type` header value. Matched
 *   case-insensitively, because media type names are not case-sensitive.
 * @returns `true` when the content type contains `application/pdf` or the URL
 *   path ends in `.pdf` (any letter case); `false` otherwise, including when
 *   the URL cannot be parsed.
 */
export function isPDF(url: string, contentType?: string): boolean {
  if (contentType?.toLowerCase().includes("application/pdf")) {
    return true;
  }

  try {
    // Only the path is checked, so query strings and fragments are ignored
    return new URL(url).pathname.toLowerCase().endsWith(".pdf");
  } catch {
    return false;
  }
}
|