All files / Cognigy-CLI/build/lib/knowledgeAI/extractionProvider lsExtractor.js

0% Statements 0/53
0% Branches 0/28
0% Functions 0/3
0% Lines 0/52

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112                                                                                                                                                                                                                               
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.lsExtractor = void 0;
const { convert } = require('html-to-text');
const text_1 = require("@langchain/classic/document_loaders/fs/text");
const pdf_1 = require("@langchain/community/document_loaders/fs/pdf");
const docx_1 = require("@langchain/community/document_loaders/fs/docx");
const csv_1 = require("@langchain/community/document_loaders/fs/csv");
const json_1 = require("@langchain/classic/document_loaders/fs/json");
const epub_1 = require("@langchain/community/document_loaders/fs/epub");
const srt_1 = require("@langchain/community/document_loaders/fs/srt");
const cheerio_1 = require("@langchain/community/document_loaders/web/cheerio");
const playwright_1 = require("@langchain/community/document_loaders/web/playwright");
const textSplitter_1 = require("../../../utils/textSplitter");
/**
 * Splitter name handed to `splitDocs` for each supported extractor type.
 *
 * Every type uses the recursive character splitter except markdown (`md`),
 * which has its own dedicated splitter.
 */
const DefaultSplitters = {
    ...Object.fromEntries([
        'text',
        'pdf',
        'docx',
        'csv',
        'json',
        'jsonl',
        'epub',
        'srt',
        'cheerio',
        'playwright',
    ].map((extractorType) => [extractorType, 'RecursiveCharacterTextSplitter'])),
    md: 'MarkdownSplitter',
};
/**
 * Picks the document loader for the requested extractor type.
 *
 * File-based types read `options.inputFile`. The web types (`cheerio`,
 * `playwright`) fetch `options.url` when it is set and otherwise fall back to
 * reading `options.inputFile` as plain text. Unknown types (and `text` / `md`)
 * are loaded with the plain text loader.
 *
 * @param {string} type - Extractor type requested by the caller.
 * @param {object} options - Extraction options (`inputFile`, `url`, `additionalParameters`).
 * @returns {object} A loader instance exposing an async `load()` method.
 */
const createDocumentLoader = (type, options) => {
    const { inputFile, url, additionalParameters } = options;
    switch (type) {
        case 'pdf':
            // Without explicit parameters the loader is configured with splitPages: false.
            return new pdf_1.PDFLoader(inputFile, additionalParameters || { splitPages: false });
        case 'docx':
            return new docx_1.DocxLoader(inputFile);
        case 'csv':
            return new csv_1.CSVLoader(inputFile, additionalParameters);
        case 'epub':
            // Without explicit parameters the loader is configured with splitChapters: true.
            return new epub_1.EPubLoader(inputFile, additionalParameters || { splitChapters: true });
        case 'json':
            return new json_1.JSONLoader(inputFile, additionalParameters);
        case 'jsonl':
            return new json_1.JSONLinesLoader(inputFile, additionalParameters);
        case 'srt':
            return new srt_1.SRTLoader(inputFile);
        case 'cheerio':
            return url
                ? new cheerio_1.CheerioWebBaseLoader(url, additionalParameters)
                : new text_1.TextLoader(inputFile);
        case 'playwright':
            return url
                ? new playwright_1.PlaywrightWebBaseLoader(url, additionalParameters)
                : new text_1.TextLoader(inputFile);
        case 'text':
        case 'md':
        default:
            return new text_1.TextLoader(inputFile);
    }
};
/**
 * Converts an HTML string to plain text with `html-to-text`.
 *
 * @param {string} html - Raw HTML markup.
 * @param {object} [conversionOptions] - Options forwarded to `convert`; when
 *   falsy, the built-in defaults below are used instead.
 * @returns {string} The text returned by `convert`.
 */
const htmlToPlainText = (html, conversionOptions) => convert(html, conversionOptions || {
    wordwrap: false,
    ignoreHref: true,
    preserveNewlines: true,
    uppercaseHeadings: false,
    singleNewLineParagraphs: true,
    selectors: [
        { selector: 'img', format: 'skip' },
        {
            selector: 'p',
            options: { leadingLineBreaks: 0, trailingLineBreaks: 1 },
        },
        {
            selector: 'pre',
            options: { leadingLineBreaks: 0, trailingLineBreaks: 1 },
        },
    ],
});
/**
 * Normalises whitespace in extracted text.
 *
 * The replacement order matters and is kept exactly as it was originally written.
 *
 * @param {string|null|undefined} text - Extracted page content.
 * @returns {string|undefined} Cleaned text, or `undefined` when `text` is nullish.
 */
const normalizePageContent = (text) => text
    // Any run of two or more whitespace characters becomes one space.
    ?.replace(/(\s){2,}/g, ' ')
    // Drop remaining carriage returns and tabs.
    .replace(/\r/g, '')
    .replace(/\t/g, '')
    // Turn literal backslash-n / backslash-t sequences into real control characters.
    .replace(/\\n/g, '\n')
    .replace(/\\t/g, '\t')
    // Collapse repeated line breaks (optionally padded with spaces) into a single newline.
    .replace(/(\r\n|\r|\n|\n |\n  |\n   | \n|  \n|   \n){2,}/g, '\n');
/**
 * Loads a document of the given type, cleans its text, splits it into chunks
 * and returns the chunks as one string separated by blank lines.
 *
 * @param {string} type - Extractor type: `text`, `pdf`, `docx`, `csv`, `json`,
 *   `jsonl`, `epub`, `srt`, `md`, `cheerio` or `playwright`. Any other value is
 *   treated as plain text with the default splitter.
 * @param {object} options - Extraction options; also forwarded unchanged to `splitDocs`.
 * @param {string} [options.inputFile] - Path of the file to load.
 * @param {string} [options.url] - URL to fetch for the `cheerio` / `playwright` types.
 * @param {object} [options.additionalParameters] - Extra loader parameters.
 *   NOTE(review): for `cheerio` / `playwright` the same object is also passed to
 *   `html-to-text` as its conversion options - confirm this double use is intended.
 * @returns {Promise<string>} The `pageContent` of every chunk, joined with `'\n\n'`.
 */
const lsExtractor = async (type, options) => {
    const docs = await createDocumentLoader(type, options).load();
    // The type never changes while iterating, so decide once whether content is HTML.
    const isHtmlSource = type === 'playwright' || type === 'cheerio';
    for (const doc of docs) {
        if (isHtmlSource) {
            doc.pageContent = htmlToPlainText(doc.pageContent, options.additionalParameters);
        }
        doc.pageContent = normalizePageContent(doc.pageContent);
    }
    // Own-property check: a plain `DefaultSplitters[type]` lookup would resolve
    // inherited members (e.g. type === 'constructor') and pass a function to
    // splitDocs instead of a splitter name.
    const configuredSplitter = Object.prototype.hasOwnProperty.call(DefaultSplitters, type)
        ? DefaultSplitters[type]
        : undefined;
    const splitterName = configuredSplitter || 'RecursiveCharacterTextSplitter';
    const chunks = await (0, textSplitter_1.splitDocs)(docs, options, splitterName);
    return chunks.map((chunk) => chunk.pageContent).join('\n\n');
};
exports.lsExtractor = lsExtractor;