All files / src/modules chunker.ts

98.27% Statements 114/116
96.87% Branches 31/32
100% Functions 7/7
98.27% Lines 114/116

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179                                                                                      1x   1x 1x 1x 1x   2x 2x 2x 2x 2x   3x 3x 3x 3x 3x 3x   3x 120x 31x 31x 31x 31x 31x 31x 31x 31x     31x 31x 31x 50x 50x 19x 19x 19x 31x 31x 31x 31x 31x 120x 120x 120x 120x   3x 3x 3x 3x 3x 3x 3x 3x 3x 3x   3x 3x       1x 11x 11x 11x 11x 11x 11x 11x   11x 11x 9x   7x 9x 1x 1x 1x 9x 1x 1x 1x 9x   1x 1x 1x 20x 20x 20x     20x 1x 1x 9x 9x 4x 4x 4x 52x 52x 52x 52x 52x 52x 52x 52x 52x 52x 52x 52x 4x 4x 9x 9x   11x 6x 6x 11x 11x         1x 9x 9x  
/**
 * AgentKits — Document Chunker Module
 *
 * Split text into chunks with configurable strategies.
 *
 * Usage:
 *   import { createChunker } from 'agentkits/chunker';
 *   const chunker = createChunker({ strategy: 'paragraph', chunkSize: 1000 });
 *   const chunks = chunker.chunk(longText);
 */
 
// ── Types ──────────────────────────────────────────────────────────
 
/**
 * Name of the algorithm used to cut text into chunks.
 *
 * - `'fixed'`     — character windows of `chunkSize`, taken directly from the input.
 * - `'sentence'`  — split after sentence-ending punctuation followed by whitespace,
 *                   then merge sentences up to `chunkSize`.
 * - `'paragraph'` — split on blank lines, then merge paragraphs up to `chunkSize`.
 * - `'semantic'`  — split on blank lines; any paragraph longer than `chunkSize`
 *                   is further split into sentences before merging.
 */
export type ChunkStrategy = 'fixed' | 'sentence' | 'paragraph' | 'semantic';
 
/**
 * Options accepted by `createChunker` / `chunkText`.
 *
 * Every field is optional; `createChunker` fills in the documented default
 * for any field that is `undefined` or `null`.
 */
export interface ChunkerConfig {
  /** Chunking strategy (default: 'fixed') */
  strategy?: ChunkStrategy;
  /** Target chunk size in characters (default: 1000) */
  chunkSize?: number;
  /** Overlap between consecutive chunks in characters (default: 200) */
  overlap?: number;
  /**
   * Custom separator for splitting (default: '\n').
   * NOTE(review): `createChunker` stores this value, but none of the
   * strategies implemented in this file read it.
   */
  separator?: string;
  /**
   * Keep separator in output (default: true).
   * NOTE(review): stored by `createChunker` but not read by any strategy
   * implemented in this file.
   */
  keepSeparator?: boolean;
}
 
/**
 * One piece of chunked text together with its position metadata.
 *
 * For the 'fixed' strategy the offsets are indices into the input string.
 * For the segment-based strategies they are computed by `mergeSegments` from
 * segment lengths rather than looked up in the input, so they need not
 * correspond to positions in the original text.
 */
export interface Chunk {
  /** Zero-based position of this chunk in the returned array. */
  index: number;
  /** The chunk's content. */
  text: string;
  /** Length of `text` in UTF-16 code units (`text.length`). */
  charCount: number;
  /** Offset at which the chunk starts (inclusive). */
  startOffset: number;
  /** Offset at which the chunk ends (exclusive). */
  endOffset: number;
}
 
/**
 * A configured chunker as returned by `createChunker`.
 */
export interface Chunker {
  /**
   * Split `text` into chunks using the configured strategy.
   * The implementation returned by `createChunker` yields an empty array for
   * empty or whitespace-only input.
   */
  chunk(text: string): Chunk[];
  /** The effective configuration, with defaults filled in for omitted fields. */
  readonly config: Readonly<Required<ChunkerConfig>>;
}
 
// ── Helpers ────────────────────────────────────────────────────────
 
/**
 * Break text into sentence-like pieces.
 *
 * A boundary is any run of whitespace that directly follows one of the
 * terminators listed in the character class below. The terminator stays
 * attached to the piece before it and the whitespace run itself is discarded.
 * Pieces that are empty or whitespace-only are dropped; surviving pieces are
 * returned untrimmed.
 *
 * @param text - Text to split.
 * @returns The non-blank pieces, in their original order.
 */
function splitSentences(text: string): string[] {
  const boundary = /(?<=[.!?。!?])\s+/;
  const sentences: string[] = [];
  for (const piece of text.split(boundary)) {
    if (piece.trim().length > 0) {
      sentences.push(piece);
    }
  }
  return sentences;
}
 
/**
 * Break text into paragraphs.
 *
 * Paragraphs are separated by a newline, optional whitespace, and another
 * newline (i.e. at least one blank or whitespace-only line). Blocks that are
 * empty or whitespace-only are dropped; surviving blocks are returned
 * untrimmed.
 *
 * @param text - Text to split.
 * @returns The non-blank paragraphs, in their original order.
 */
function splitParagraphs(text: string): string[] {
  const blankLineGap = /\n\s*\n/;
  const hasContent = (block: string): boolean => block.trim() !== '';
  return text.split(blankLineGap).filter(hasContent);
}
 
/**
 * Greedily pack pre-split segments into chunks, joining segments with a
 * single space.
 *
 * A chunk is closed as soon as adding the next segment would make its joined
 * length exceed `chunkSize`. Trailing segments of the closed chunk whose
 * combined length (each counted with its joining space) fits within `overlap`
 * are repeated at the start of the next chunk.
 *
 * Notes:
 * - A single segment longer than `chunkSize` is never split; it becomes (part
 *   of) an oversized chunk.
 * - Carried-over segments count toward the next chunk's length, so a chunk can
 *   exceed `chunkSize` when `overlap` is large relative to `chunkSize`.
 * - `startOffset` / `endOffset` are positions within `segments.join(' ')`,
 *   not within whatever text the segments were originally cut from.
 *
 * @param segments - Ordered pieces of text to merge.
 * @param chunkSize - Target maximum chunk length in characters.
 * @param overlap - Maximum number of characters to repeat between chunks.
 * @returns The merged chunks, indexed from 0.
 */
function mergeSegments(segments: string[], chunkSize: number, overlap: number): Chunk[] {
  const chunks: Chunk[] = [];
  let pending: string[] = [];
  // Sum of (length + 1) over `pending`: the joined length plus the space that
  // would precede the next segment.
  let pendingLen = 0;
  // Offset, within the space-joined text, at which the next segment starts.
  let nextSegmentOffset = 0;
  let chunkStartOffset = 0;

  const emitPending = (): void => {
    const text = pending.join(' ');
    chunks.push({
      index: chunks.length,
      text,
      charCount: text.length,
      startOffset: chunkStartOffset,
      endOffset: chunkStartOffset + text.length,
    });
  };

  for (const segment of segments) {
    if (pending.length > 0 && pendingLen + segment.length > chunkSize) {
      emitPending();

      // Walk backwards, keeping whole trailing segments while they fit in the
      // overlap budget (each costs its length plus one joining space).
      let carriedLen = 0;
      let keepFrom = pending.length;
      for (let i = pending.length - 1; i >= 0; i--) {
        const cost = pending[i].length + 1;
        if (carriedLen + cost > overlap) break;
        carriedLen += cost;
        keepFrom = i;
      }

      pending = pending.slice(keepFrom);
      pendingLen = carriedLen;
      // The carried segments end exactly where the next segment begins, so the
      // new chunk starts `carriedLen` characters before that point.
      chunkStartOffset = nextSegmentOffset - carriedLen;
    }

    pending.push(segment);
    pendingLen += segment.length + 1;
    nextSegmentOffset += segment.length + 1;
  }

  if (pending.length > 0) {
    emitPending();
  }

  return chunks;
}
 
// ── Factory ────────────────────────────────────────────────────────
 
/**
 * Create a chunker bound to the given configuration.
 *
 * Omitted options fall back to: strategy 'fixed', chunkSize 1000, overlap 200,
 * separator '\n', keepSeparator true. (`separator` and `keepSeparator` are
 * stored on the resolved config but are not consulted by any strategy here.)
 *
 * Strategies:
 * - 'fixed': character windows of `chunkSize` that advance by
 *   `chunkSize - overlap`. The window always advances by at least one
 *   character, so an `overlap` greater than or equal to `chunkSize` can
 *   neither loop forever nor truncate the text. Chunking stops as soon as a
 *   window reaches the end of the text, so no redundant tail chunk is emitted.
 *   Non-positive or NaN `chunkSize` is treated as 1, negative or NaN `overlap`
 *   as 0, and fractional values are floored.
 * - 'sentence' / 'paragraph': split accordingly, then merge via `mergeSegments`.
 * - 'semantic': split into paragraphs; paragraphs longer than `chunkSize` are
 *   split into sentences; then merge via `mergeSegments`.
 *
 * @param userConfig - Partial configuration; defaults to `{}`.
 * @returns A chunker exposing `chunk(text)` and the resolved `config`.
 */
export function createChunker(userConfig: ChunkerConfig = {}): Chunker {
  const config: Required<ChunkerConfig> = {
    strategy: userConfig.strategy ?? 'fixed',
    chunkSize: userConfig.chunkSize ?? 1000,
    overlap: userConfig.overlap ?? 200,
    separator: userConfig.separator ?? '\n',
    keepSeparator: userConfig.keepSeparator ?? true,
  };

  /** Sliding fixed-width window over the raw text. */
  const chunkFixed = (text: string): Chunk[] => {
    // The comparisons are false for NaN, which therefore falls back too.
    const size = config.chunkSize >= 1 ? Math.floor(config.chunkSize) : 1;
    const overlap = config.overlap > 0 ? Math.floor(config.overlap) : 0;
    // Guarantee forward progress even when overlap >= size.
    const step = Math.max(1, size - overlap);

    const chunks: Chunk[] = [];
    for (let start = 0; start < text.length; start += step) {
      const end = Math.min(start + size, text.length);
      const piece = text.slice(start, end);
      chunks.push({
        index: chunks.length,
        text: piece,
        charCount: piece.length,
        startOffset: start,
        endOffset: end,
      });
      // The text is fully covered; a further window would only repeat a
      // suffix of this chunk.
      if (end >= text.length) break;
    }
    return chunks;
  };

  return {
    chunk(text: string): Chunk[] {
      if (!text || text.trim().length === 0) return [];

      switch (config.strategy) {
        case 'sentence':
          return mergeSegments(splitSentences(text), config.chunkSize, config.overlap);
        case 'paragraph':
          return mergeSegments(splitParagraphs(text), config.chunkSize, config.overlap);
        case 'semantic': {
          // Paragraphs first; only oversized paragraphs are broken into sentences.
          const segments = splitParagraphs(text).flatMap(paragraph =>
            paragraph.length <= config.chunkSize ? [paragraph] : splitSentences(paragraph),
          );
          return mergeSegments(segments, config.chunkSize, config.overlap);
        }
        case 'fixed':
        default:
          return chunkFixed(text);
      }
    },

    get config() {
      return config;
    },
  };
}
 
// ── Convenience ────────────────────────────────────────────────────
 
/** Quick chunk text with defaults */
/**
 * One-shot convenience wrapper: builds a chunker from `options` (defaults
 * apply when omitted) and immediately chunks `text` with it.
 *
 * @param text - Text to chunk.
 * @param options - Optional chunker configuration.
 * @returns The chunks produced by the configured strategy.
 */
export function chunkText(text: string, options?: ChunkerConfig): Chunk[] {
  const chunker = createChunker(options);
  return chunker.chunk(text);
}