Press n or j to go to the next uncovered block, b, p or k for the previous block.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.lsExtractor = void 0;
const { convert } = require('html-to-text');
const text_1 = require("@langchain/classic/document_loaders/fs/text");
const pdf_1 = require("@langchain/community/document_loaders/fs/pdf");
const docx_1 = require("@langchain/community/document_loaders/fs/docx");
const csv_1 = require("@langchain/community/document_loaders/fs/csv");
const json_1 = require("@langchain/classic/document_loaders/fs/json");
const epub_1 = require("@langchain/community/document_loaders/fs/epub");
const srt_1 = require("@langchain/community/document_loaders/fs/srt");
const cheerio_1 = require("@langchain/community/document_loaders/web/cheerio");
const playwright_1 = require("@langchain/community/document_loaders/web/playwright");
const textSplitter_1 = require("../../../utils/textSplitter");

/** Splitter name used for any type that has no entry of its own in DefaultSplitters. */
const FALLBACK_SPLITTER = 'RecursiveCharacterTextSplitter';

/** Maps each known source type to the splitter name that is handed to splitDocs. */
const DefaultSplitters = {
    text: 'RecursiveCharacterTextSplitter',
    pdf: 'RecursiveCharacterTextSplitter',
    docx: 'RecursiveCharacterTextSplitter',
    csv: 'RecursiveCharacterTextSplitter',
    json: 'RecursiveCharacterTextSplitter',
    jsonl: 'RecursiveCharacterTextSplitter',
    epub: 'RecursiveCharacterTextSplitter',
    srt: 'RecursiveCharacterTextSplitter',
    cheerio: 'RecursiveCharacterTextSplitter',
    playwright: 'RecursiveCharacterTextSplitter',
    md: 'MarkdownSplitter',
};

/**
 * Builds the html-to-text options used when the caller supplies no
 * `additionalParameters`. A fresh object is created on every call so that no
 * options object is ever shared between conversions.
 */
const defaultHtmlToTextOptions = () => ({
    wordwrap: false,
    ignoreHref: true,
    preserveNewlines: true,
    uppercaseHeadings: false,
    singleNewLineParagraphs: true,
    selectors: [
        { selector: 'img', format: 'skip' },
        {
            selector: 'p',
            options: { leadingLineBreaks: 0, trailingLineBreaks: 1 },
        },
        {
            selector: 'pre',
            options: { leadingLineBreaks: 0, trailingLineBreaks: 1 },
        },
    ],
});

/**
 * Picks the document loader for a source type.
 *
 * 'cheerio' and 'playwright' use their web loader only when `options.url` is
 * set; without a URL they read `options.inputFile` as plain text. 'text', 'md'
 * and every unrecognised type are read with TextLoader.
 *
 * @param {string} type - Source type key.
 * @param {object} options - Reads `inputFile`, `url` and `additionalParameters`.
 * @returns {object} A loader instance exposing `load()`.
 */
const createLoader = (type, options) => {
    switch (type) {
        case 'pdf':
            return new pdf_1.PDFLoader(options.inputFile, options.additionalParameters || { splitPages: false });
        case 'docx':
            return new docx_1.DocxLoader(options.inputFile);
        case 'csv':
            return new csv_1.CSVLoader(options.inputFile, options.additionalParameters);
        case 'epub':
            return new epub_1.EPubLoader(options.inputFile, options.additionalParameters || { splitChapters: true });
        case 'json':
            return new json_1.JSONLoader(options.inputFile, options.additionalParameters);
        case 'jsonl':
            return new json_1.JSONLinesLoader(options.inputFile, options.additionalParameters);
        case 'srt':
            return new srt_1.SRTLoader(options.inputFile);
        case 'cheerio':
            return options.url
                ? new cheerio_1.CheerioWebBaseLoader(options.url, options.additionalParameters)
                : new text_1.TextLoader(options.inputFile);
        case 'playwright':
            return options.url
                ? new playwright_1.PlaywrightWebBaseLoader(options.url, options.additionalParameters)
                : new text_1.TextLoader(options.inputFile);
        case 'text':
        case 'md':
        default:
            return new text_1.TextLoader(options.inputFile);
    }
};

/**
 * Cleans up whitespace in extracted text. The steps run in this exact order:
 *   1. every run of two or more whitespace characters (line breaks included)
 *      becomes a single space;
 *   2. remaining carriage returns and tabs are deleted;
 *   3. the two-character escape sequences backslash-n and backslash-t are
 *      turned into a real newline and a real tab;
 *   4. runs of line breaks (optionally with an adjacent space) collapse into
 *      one newline. The pattern is kept verbatim from the original code.
 *
 * @param {string | null | undefined} text - Text to clean.
 * @returns {string | undefined} Cleaned text, or undefined when `text` is nullish.
 */
const normalizeWhitespace = (text) => text
    ?.replace(/(\s){2,}/g, ' ')
    .replace(/\r/g, '')
    .replace(/\t/g, '')
    .replace(/\\n/g, '\n')
    .replace(/\\t/g, '\t')
    .replace(/(\r\n|\r|\n|\n |\n |\n | \n| \n| \n){2,}/g, '\n');

/**
 * Loads a document of the given type, normalises its text, splits it into
 * chunks and returns the chunks joined by blank lines.
 *
 * For 'cheerio' and 'playwright' the loaded page content is first passed
 * through html-to-text's `convert`, using `options.additionalParameters` when
 * provided and the built-in defaults otherwise.
 *
 * @param {string} type - Source type; selects the loader and the default splitter.
 * @param {object} options - Reads `inputFile`, `url` and `additionalParameters`;
 *   the whole object is also forwarded to `splitDocs`.
 * @returns {Promise<string>} The `pageContent` of every chunk, joined with '\n\n'.
 */
const lsExtractor = async (type, options) => {
    const docs = await createLoader(type, options).load();
    const isHtmlSource = type === 'playwright' || type === 'cheerio';
    for (const doc of docs) {
        if (isHtmlSource) {
            doc.pageContent = convert(doc.pageContent, options.additionalParameters || defaultHtmlToTextOptions());
        }
        doc.pageContent = normalizeWhitespace(doc.pageContent);
    }
    // Own-property check: a plain `DefaultSplitters[type]` lookup would resolve
    // types such as 'constructor' or 'toString' to inherited Object.prototype
    // members (truthy functions) instead of falling back to the default name.
    const splitterName = Object.prototype.hasOwnProperty.call(DefaultSplitters, type)
        ? DefaultSplitters[type]
        : FALLBACK_SPLITTER;
    const chunks = await (0, textSplitter_1.splitDocs)(docs, options, splitterName);
    return chunks.map((chunk) => chunk.pageContent).join('\n\n');
};
exports.lsExtractor = lsExtractor;