Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 9x 9x 9x 9x 9x 9x 9x 9x 9x 9x 6x 6x 5x 5x 5x 5x 5x 5x 6x 1x 1x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 6x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 3x 3x 3x 3x 3x 1x 1x 1x | /**
* AgentKits — PDF Parser Module
*
* Extract text from PDF buffers, split into pages/chunks.
*
* Usage:
* import { parsePDF } from 'agentkits/pdf-parser';
* const result = await parsePDF(buffer);
* console.log(result.pages);
*/
// ── Types ──────────────────────────────────────────────────────────
/** A single extracted page of a PDF document. */
export interface PDFPage {
  /** 1-based page index within the document. */
  pageNumber: number;
  /** Extracted (optionally whitespace-normalized) text of the page. */
  text: string;
  /** Length of `text` in characters. */
  charCount: number;
}
/** Result of {@link parsePDF}: the full document text plus a per-page breakdown. */
export interface PDFParseResult {
  /** All pages' text joined with blank lines (`\n\n`). */
  text: string;
  /** Per-page text and character counts. */
  pages: PDFPage[];
  /** Number of parsed pages (after any `maxPages` truncation). */
  totalPages: number;
  /** Character count of the joined `text`. */
  totalChars: number;
  /** Document metadata when present — keys seen in practice: `title`, `author`. */
  metadata?: Record<string, string>;
}
/** Options accepted by {@link parsePDF} (and, via spread, {@link parsePDFChunked}). */
export interface PDFParserConfig {
  /** Max pages to parse (default: all) */
  maxPages?: number;
  /** Remove excessive whitespace (default: true) */
  normalizeWhitespace?: boolean;
}
// ── Simple built-in PDF text extractor ─────────────────────────────
/**
 * Minimal PDF text extraction — pulls literal strings out of BT…ET text
 * objects (`Tj`, `'`, `"` and `TJ` operators) and basic Info-dictionary
 * metadata (`/Title`, `/Author`).
 *
 * Decodes the standard PDF literal-string escapes (`\(`, `\)`, `\\`,
 * `\n`, `\r`, `\t`, `\b`, `\f` and octal `\ddd`). Compressed streams and
 * CID/hex-encoded text are NOT handled — for production use with complex
 * PDFs, install `pdf-parse` as an optional dependency.
 */
function extractTextFromPDFBuffer(buffer: Buffer): { pages: string[]; metadata: Record<string, string> } {
  // latin1 is a byte-preserving decode, so regexes see the raw PDF bytes.
  const content = buffer.toString('latin1');
  const pages: string[] = [];
  const metadata: Record<string, string> = {};
  // Extract metadata from the Info dictionary (literal-string values only;
  // escape-aware so titles containing `\)` etc. are captured in full).
  const titleMatch = content.match(/\/Title\s*\(((?:\\.|[^\\()])*)\)/);
  if (titleMatch) metadata.title = decodePDFString(titleMatch[1]);
  const authorMatch = content.match(/\/Author\s*\(((?:\\.|[^\\()])*)\)/);
  if (authorMatch) metadata.author = decodePDFString(authorMatch[1]);
  // Extract text between BT...ET blocks (text objects).
  const textBlocks: string[] = [];
  const btEtRegex = /BT\s([\s\S]*?)ET/g;
  let match: RegExpExecArray | null;
  while ((match = btEtRegex.exec(content)) !== null) {
    const block = match[1];
    // Show-text operators that take a single literal string: Tj, ' and ".
    // (For " the string is the last operand, so the same pattern applies.)
    const tjRegex = /\(((?:\\.|[^\\()])*)\)\s*(?:Tj|'|")/g;
    let tjMatch: RegExpExecArray | null;
    while ((tjMatch = tjRegex.exec(block)) !== null) {
      textBlocks.push(decodePDFString(tjMatch[1]));
    }
    // TJ takes an array of strings interleaved with kerning numbers.
    const tjArrayRegex = /\[((?:\\.|[^\\\]])*)\]\s*TJ/g;
    let arrMatch: RegExpExecArray | null;
    while ((arrMatch = tjArrayRegex.exec(block)) !== null) {
      const inner = arrMatch[1];
      const strRegex = /\(((?:\\.|[^\\()])*)\)/g;
      let strMatch: RegExpExecArray | null;
      while ((strMatch = strRegex.exec(inner)) !== null) {
        textBlocks.push(decodePDFString(strMatch[1]));
      }
    }
  }
  // Rough page split: one page per /Type /Page marker ([^s] excludes /Pages).
  const pageMarkers = content.split(/\/Type\s*\/Page[^s]/);
  if (pageMarkers.length > 1) {
    // Distribute text blocks across pages positionally — not exact, but
    // acceptable for a fallback extractor.
    const blocksPerPage = Math.max(1, Math.ceil(textBlocks.length / (pageMarkers.length - 1)));
    for (let i = 0; i < pageMarkers.length - 1; i++) {
      const start = i * blocksPerPage;
      const end = Math.min(start + blocksPerPage, textBlocks.length);
      pages.push(textBlocks.slice(start, end).join(' '));
    }
  } else {
    pages.push(textBlocks.join(' '));
  }
  return { pages, metadata };
}
/** Decode PDF literal-string escapes: \n \r \t \b \f \( \) \\ and octal \ddd. */
function decodePDFString(raw: string): string {
  return raw.replace(/\\([nrtbf()\\]|[0-7]{1,3})/g, (_whole, esc: string) => {
    switch (esc) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'b': return '\b';
      case 'f': return '\f';
      default:
        // \( \) \\ map to the literal character; otherwise an octal char code.
        return /^[0-7]+$/.test(esc) ? String.fromCharCode(parseInt(esc, 8)) : esc;
    }
  });
}
/**
 * Tidy extracted text: collapse runs of spaces/tabs into a single space,
 * cap consecutive blank lines at one, and trim leading/trailing whitespace.
 */
function normalizeWS(text: string): string {
  const singleSpaced = text.replace(/[ \t]+/g, ' ');
  const limitedBlankLines = singleSpaced.replace(/\n{3,}/g, '\n\n');
  return limitedBlankLines.trim();
}
// ── Main API ───────────────────────────────────────────────────────
/**
 * Parse PDF from a Buffer.
 *
 * Attempts the optional `pdf-parse` dependency first; if it is missing or
 * throws, silently falls back to the built-in extractor.
 */
export async function parsePDF(buffer: Buffer, config: PDFParserConfig = {}): Promise<PDFParseResult> {
  const { maxPages, normalizeWhitespace = true } = config;
  let rawPages: string[];
  let metadata: Record<string, string> = {};
  try {
    // Try optional pdf-parse dependency
    // @ts-ignore — pdf-parse is an optional dependency
    const pdfParse = (await import('pdf-parse')).default;
    const parsed = await pdfParse(buffer, { max: maxPages ?? 0 });
    metadata = {};
    if (parsed.info?.Title) metadata.title = parsed.info.Title;
    if (parsed.info?.Author) metadata.author = parsed.info.Author;
    // pdf-parse doesn't give per-page easily; form feeds separate pages.
    const byFormFeed = parsed.text.split('\f').filter((p: string) => p.trim());
    rawPages = byFormFeed.length > 0 ? byFormFeed : [parsed.text];
  } catch {
    // Fallback to built-in extractor
    const fallback = extractTextFromPDFBuffer(buffer);
    rawPages = fallback.pages;
    metadata = fallback.metadata;
  }
  if (maxPages && rawPages.length > maxPages) {
    rawPages = rawPages.slice(0, maxPages);
  }
  const pages: PDFPage[] = [];
  for (const [index, raw] of rawPages.entries()) {
    const cleaned = normalizeWhitespace ? normalizeWS(raw) : raw;
    pages.push({ pageNumber: index + 1, text: cleaned, charCount: cleaned.length });
  }
  const joined = pages.map((p) => p.text).join('\n\n');
  return {
    text: joined,
    pages,
    totalPages: pages.length,
    totalChars: joined.length,
    metadata,
  };
}
/**
 * Parse PDF and split its text into chunks of roughly `chunkSize` characters,
 * with consecutive chunks sharing up to `overlap` characters.
 *
 * Fix: the previous sliding-window loop stopped advancing (and could loop
 * forever with `offset` going negative) whenever `overlap >= chunkSize`.
 * The overlap is now clamped below the chunk size so every iteration moves
 * forward by at least one character; normal-case chunk boundaries are
 * unchanged.
 *
 * @param buffer  Raw PDF bytes.
 * @param options Parser config plus `chunkSize` (default 1000) and
 *                `overlap` (default 200) in characters.
 * @returns Ordered text chunks and any document metadata.
 */
export async function parsePDFChunked(
  buffer: Buffer,
  options: PDFParserConfig & { chunkSize?: number; overlap?: number } = {}
): Promise<{ chunks: string[]; metadata?: Record<string, string> }> {
  const { chunkSize = 1000, overlap = 200, ...parserConfig } = options;
  const result = await parsePDF(buffer, parserConfig);
  const text = result.text;
  const chunks: string[] = [];
  // Clamp to sane values: chunk size >= 1, overlap strictly smaller than the
  // chunk so the window always advances (prevents the infinite loop).
  const size = Math.max(1, Math.floor(chunkSize));
  const step = size - Math.min(Math.max(0, Math.floor(overlap)), size - 1);
  for (let offset = 0; offset < text.length; offset += step) {
    chunks.push(text.slice(offset, Math.min(offset + size, text.length)));
    // Stop once a chunk has reached the end of the text.
    if (offset + size >= text.length) break;
  }
  return { chunks, metadata: result.metadata };
}
|