All files / src/modules pdf-parser.ts

89.69% Statements 87/97
51.85% Branches 14/27
100% Functions 4/4
89.69% Lines 87/97

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176                                                                                6x 6x 6x 6x     6x 6x 6x 6x     6x 6x 6x   6x 9x   9x 9x 9x 9x 9x   9x 9x 9x               9x     6x 6x   5x 5x 5x 5x 5x 5x 6x 1x 1x   6x 6x   6x 6x 6x                 6x 6x   6x 6x   6x     6x 6x   6x 6x 6x   6x 6x 6x   6x 6x 6x 6x   6x       6x 6x 6x 6x   6x   6x 6x 6x 6x 6x 6x 6x 6x         1x 1x 1x 1x 1x 1x   1x 1x 1x   1x 3x 3x 3x 3x 3x   1x   1x 1x  
/**
 * AgentKits — PDF Parser Module
 *
 * Extract text from PDF buffers, split into pages/chunks.
 *
 * Usage:
 *   import { parsePDF } from 'agentkits/pdf-parser';
 *   const result = await parsePDF(buffer);
 *   console.log(result.pages);
 */
 
// ── Types ──────────────────────────────────────────────────────────
 
/** A single page extracted from a PDF document. */
export interface PDFPage {
  /** 1-based page index within the document. */
  pageNumber: number;
  /** Extracted (and, by default, whitespace-normalized) text of this page. */
  text: string;
  /** Length of `text` in characters. */
  charCount: number;
}
 
/** Result of a `parsePDF` call. */
export interface PDFParseResult {
  /** Full document text: every page's text joined with blank lines. */
  text: string;
  /** Per-page breakdown, in document order. */
  pages: PDFPage[];
  /** Number of pages actually returned (after any `maxPages` truncation). */
  totalPages: number;
  /** Length of `text` in characters. */
  totalChars: number;
  /** `title`/`author` when found in the PDF's Info dictionary; may be empty. */
  metadata?: Record<string, string>;
}
 
/** Options accepted by `parsePDF` (and, via spread, `parsePDFChunked`). */
export interface PDFParserConfig {
  /** Max pages to parse (default: all) */
  maxPages?: number;
  /** Remove excessive whitespace (default: true) */
  normalizeWhitespace?: boolean;
}
 
// ── Simple built-in PDF text extractor ─────────────────────────────
 
/**
 * Minimal PDF text extraction — extracts text from uncompressed stream objects.
 *
 * Handles the Tj, ', " and TJ show-text operators inside BT…ET text objects,
 * decodes literal-string escapes (\( \) \\ \n \r \t \b \f and octal \ddd),
 * and pulls Title/Author from the Info dictionary when present.
 *
 * For production use with complex/compressed PDFs, install `pdf-parse`
 * as optional dep.
 *
 * @param buffer Raw PDF file bytes.
 * @returns Rough per-page text plus any title/author metadata found.
 */
function extractTextFromPDFBuffer(buffer: Buffer): { pages: string[]; metadata: Record<string, string> } {
  const content = buffer.toString('latin1');
  const pages: string[] = [];
  const metadata: Record<string, string> = {};

  // PDF literal string body: escaped chars or anything that isn't a bare
  // paren/backslash. Unlike `[^)]*`, this survives \( and \) inside strings.
  const STR = String.raw`\(((?:\\.|[^\\()])*)\)`;

  // Decode literal-string escape sequences (\n, \r, \t, \b, \f, \ddd octal;
  // any other escaped char — including \( \) \\ — decodes to itself).
  const decode = (s: string): string =>
    s.replace(/\\(\d{1,3}|[\s\S])/g, (_all, esc: string) => {
      switch (esc) {
        case 'n': return '\n';
        case 'r': return '\r';
        case 't': return '\t';
        case 'b': return '\b';
        case 'f': return '\f';
        default:
          return /^\d+$/.test(esc) ? String.fromCharCode(parseInt(esc, 8) & 0xff) : esc;
      }
    });

  // Extract metadata from Info dictionary (escape-aware)
  const titleMatch = content.match(new RegExp(String.raw`\/Title\s*` + STR));
  if (titleMatch) metadata.title = decode(titleMatch[1]);
  const authorMatch = content.match(new RegExp(String.raw`\/Author\s*` + STR));
  if (authorMatch) metadata.author = decode(authorMatch[1]);

  // Extract text between BT...ET blocks (text objects)
  const textBlocks: string[] = [];
  const btEtRegex = /BT\s([\s\S]*?)ET/g;
  // (str) Tj, (str) ' and (str) " each show a single string.
  const showRegex = new RegExp(STR + String.raw`\s*(?:Tj|'|")`, 'g');
  // [ (a) -120 (b) … ] TJ shows every string in the array.
  const tjArrayRegex = /\[([^\]]*)\]\s*TJ/g;
  const strRegex = new RegExp(STR, 'g');
  let match: RegExpExecArray | null;

  while ((match = btEtRegex.exec(content)) !== null) {
    const block = match[1];
    // Reset sticky positions — these global regexes are reused per block.
    showRegex.lastIndex = 0;
    let tjMatch: RegExpExecArray | null;
    while ((tjMatch = showRegex.exec(block)) !== null) {
      textBlocks.push(decode(tjMatch[1]));
    }
    tjArrayRegex.lastIndex = 0;
    let arrMatch: RegExpExecArray | null;
    while ((arrMatch = tjArrayRegex.exec(block)) !== null) {
      const inner = arrMatch[1];
      strRegex.lastIndex = 0;
      let strMatch: RegExpExecArray | null;
      while ((strMatch = strRegex.exec(inner)) !== null) {
        textBlocks.push(decode(strMatch[1]));
      }
    }
  }

  // Simple page splitting by page markers. Lookahead (not [^s]) so a
  // "/Type /Page" at the very end of the buffer still counts as a page.
  const pageMarkers = content.split(/\/Type\s*\/Page(?!s)/);
  if (pageMarkers.length > 1) {
    // Distribute text blocks across pages roughly
    const blocksPerPage = Math.max(1, Math.ceil(textBlocks.length / (pageMarkers.length - 1)));
    for (let i = 0; i < pageMarkers.length - 1; i++) {
      const start = i * blocksPerPage;
      const end = Math.min(start + blocksPerPage, textBlocks.length);
      pages.push(textBlocks.slice(start, end).join(' '));
    }
  } else {
    pages.push(textBlocks.join(' '));
  }

  return { pages, metadata };
}
 
/**
 * Collapse runs of spaces/tabs to a single space, cap consecutive blank
 * lines at one, and strip leading/trailing whitespace.
 */
function normalizeWS(text: string): string {
  const singleSpaced = text.replace(/[ \t]+/g, ' ');
  const limitedBlanks = singleSpaced.replace(/\n{3,}/g, '\n\n');
  return limitedBlanks.trim();
}
 
// ── Main API ───────────────────────────────────────────────────────
 
/**
 * Parse PDF from a Buffer.
 *
 * Prefers the optional `pdf-parse` dependency when it can be dynamically
 * imported; any failure (missing module or parse error) falls back to the
 * built-in extractor. Pages beyond `maxPages` are dropped, and each page's
 * whitespace is normalized unless `normalizeWhitespace` is false.
 */
export async function parsePDF(buffer: Buffer, config: PDFParserConfig = {}): Promise<PDFParseResult> {
  const { maxPages, normalizeWhitespace = true } = config;

  let rawPages: string[];
  let metadata: Record<string, string> = {};

  try {
    // Optional dependency, resolved dynamically so its absence is not fatal.
    // @ts-ignore — pdf-parse is an optional dependency
    const pdfParse = (await import('pdf-parse')).default;
    const data = await pdfParse(buffer, { max: maxPages ?? 0 });
    const info = data.info;
    if (info?.Title) metadata.title = info.Title;
    if (info?.Author) metadata.author = info.Author;
    // pdf-parse doesn't give per-page easily; split by form feed
    const split = data.text.split('\f').filter((p: string) => p.trim());
    rawPages = split.length > 0 ? split : [data.text];
  } catch {
    // Fallback to built-in
    const fallback = extractTextFromPDFBuffer(buffer);
    rawPages = fallback.pages;
    metadata = fallback.metadata;
  }

  if (maxPages && rawPages.length > maxPages) {
    rawPages = rawPages.slice(0, maxPages);
  }

  const pages: PDFPage[] = rawPages.map((pageText, index) => {
    const cleaned = normalizeWhitespace ? normalizeWS(pageText) : pageText;
    return { pageNumber: index + 1, text: cleaned, charCount: cleaned.length };
  });

  const fullText = pages.map((page) => page.text).join('\n\n');

  return {
    text: fullText,
    pages,
    totalPages: pages.length,
    totalChars: fullText.length,
    metadata,
  };
}
 
/**
 * Parse PDF and split into chunks of roughly `chunkSize` characters.
 *
 * Consecutive chunks overlap by `overlap` characters so that text spanning
 * a chunk boundary appears intact in at least one chunk.
 *
 * Fix: the old advance of `end - overlap` never progressed when
 * `overlap >= chunkSize` (or `chunkSize <= 0`), looping forever.
 * The step is now clamped to at least one character.
 */
export async function parsePDFChunked(
  buffer: Buffer,
  options: PDFParserConfig & { chunkSize?: number; overlap?: number } = {}
): Promise<{ chunks: string[]; metadata?: Record<string, string> }> {
  const { chunkSize = 1000, overlap = 200, ...parserConfig } = options;
  const result = await parsePDF(buffer, parserConfig);

  const text = result.text;
  const chunks: string[] = [];
  // Guard against a non-positive advance (overlap >= chunkSize, or
  // chunkSize <= 0): always move forward by at least one character.
  const step = Math.max(1, chunkSize - overlap);

  for (let offset = 0; offset < text.length; offset += step) {
    const end = Math.min(offset + chunkSize, text.length);
    chunks.push(text.slice(offset, end));
    if (end >= text.length) break; // last chunk reached the end of the text
  }

  return { chunks, metadata: result.metadata };
}