/**
* AgentKits — Document Chunker Module
*
* Split text into chunks with configurable strategies.
*
* Usage:
* import { createChunker } from 'agentkits/chunker';
* const chunker = createChunker({ strategy: 'paragraph', chunkSize: 1000 });
* const chunks = chunker.chunk(longText);
*/
// ── Types ──────────────────────────────────────────────────────────
/**
 * Name of the splitting strategy a chunker applies.
 *
 * - 'fixed': character windows of `chunkSize`, each starting `overlap`
 *   characters before the previous window's end.
 * - 'sentence': split after sentence-ending punctuation, then merge
 *   sentences up to `chunkSize`.
 * - 'paragraph': split on blank lines, then merge paragraphs up to `chunkSize`.
 * - 'semantic': split on blank lines; paragraphs longer than `chunkSize` are
 *   further split into sentences before merging.
 */
export type ChunkStrategy = 'fixed' | 'sentence' | 'paragraph' | 'semantic';
/** Options accepted by `createChunker`; every field is optional and has a default. */
export interface ChunkerConfig {
/** Chunking strategy (default: 'fixed'). */
strategy?: ChunkStrategy;
/** Target chunk size in characters (default: 1000). */
chunkSize?: number;
/** Overlap between consecutive chunks in characters (default: 200). */
overlap?: number;
/**
 * Custom separator for splitting (default: '\n').
 * NOTE(review): no strategy visible in this file reads this value — confirm
 * whether it is consumed elsewhere or is not yet implemented.
 */
separator?: string;
/**
 * Keep separator in output (default: true).
 * NOTE(review): like `separator`, this is stored but not read by any strategy
 * visible in this file — confirm.
 */
keepSeparator?: boolean;
}
/** One piece of chunked text together with its position metadata. */
export interface Chunk {
/** Zero-based position of this chunk in the returned array. */
index: number;
/** The chunk's text content. */
text: string;
/** Length of `text` in UTF-16 code units (`text.length`). */
charCount: number;
/**
 * Start position of the chunk. For the 'fixed' strategy this is the slice
 * start in the input text; for the merging strategies it is derived from
 * accumulated segment lengths rather than located in the input text.
 */
startOffset: number;
/** End position of the chunk (exclusive); see `startOffset` for the coordinate system. */
endOffset: number;
}
/** A chunker instance bound to a fully-resolved configuration. */
export interface Chunker {
/** Split `text` into chunks using the configured strategy. */
chunk(text: string): Chunk[];
/** The effective configuration, with defaults filled in for omitted options. */
readonly config: Readonly<Required<ChunkerConfig>>;
}
// ── Helpers ────────────────────────────────────────────────────────
/**
 * Break text into sentences.
 *
 * A boundary is any run of whitespace that directly follows one of the
 * characters `.`, `!`, `?` or `。`; the punctuation stays attached to the
 * sentence it ends and the whitespace run is discarded. Pieces that are empty
 * or whitespace-only are dropped.
 *
 * NOTE(review): `!?` appears twice inside the character class; the second pair
 * may have been meant as full-width forms — confirm against the original source.
 *
 * @param text Text to split.
 * @returns The non-blank sentences, in order.
 */
function splitSentences(text: string): string[] {
  const pieces = text.split(/(?<=[.!?。!?])\s+/);
  const sentences: string[] = [];
  for (const piece of pieces) {
    if (piece.trim() !== '') {
      sentences.push(piece);
    }
  }
  return sentences;
}
/**
 * Break text into paragraphs.
 *
 * A boundary is a newline, optional whitespace, then another newline (i.e. one
 * or more blank lines). Pieces that are empty or whitespace-only are dropped;
 * surviving paragraphs are returned untrimmed.
 *
 * @param text Text to split.
 * @returns The non-blank paragraphs, in order.
 */
function splitParagraphs(text: string): string[] {
  const paragraphs: string[] = [];
  for (const block of text.split(/\n\s*\n/)) {
    if (block.trim() !== '') {
      paragraphs.push(block);
    }
  }
  return paragraphs;
}
/**
 * Greedily pack pre-split segments into chunks of at most `chunkSize`
 * characters, carrying trailing segments over as overlap.
 *
 * Segments inside a chunk are joined with a single space. When adding the next
 * segment would push the pending chunk past `chunkSize`, the pending chunk is
 * emitted and the next chunk is seeded with as many trailing segments of the
 * emitted chunk as fit within `overlap` (each counted with one joining space).
 * A single segment longer than `chunkSize` is emitted as its own oversized
 * chunk; segments are never cut.
 *
 * Offsets are positions in the stream formed by joining ALL segments with one
 * space, so `joined.slice(startOffset, endOffset) === text` holds for every
 * chunk. They match positions in the caller's original text only when the
 * segments were separated there by exactly one character.
 *
 * @param segments Ordered text segments (sentences, paragraphs, ...).
 * @param chunkSize Maximum chunk length in characters before a new chunk starts.
 * @param overlap Character budget for trailing segments repeated in the next chunk.
 * @returns Chunks in order, indexed from 0.
 */
function mergeSegments(segments: string[], chunkSize: number, overlap: number): Chunk[] {
  const chunks: Chunk[] = [];
  let pending: string[] = [];
  // Length of `pending` counting one joining space per segment.
  let pendingLen = 0;
  // Offset (in the space-joined stream) at which the next incoming segment starts.
  let nextOffset = 0;
  // Offset at which the first segment of `pending` starts.
  let pendingStart = 0;

  const emitPending = (): void => {
    const text = pending.join(' ');
    chunks.push({
      index: chunks.length,
      text,
      charCount: text.length,
      startOffset: pendingStart,
      endOffset: pendingStart + text.length,
    });
  };

  for (const segment of segments) {
    if (pending.length > 0 && pendingLen + segment.length > chunkSize) {
      emitPending();

      // Walk backwards, keeping whole trailing segments while they fit the overlap budget.
      let keptLen = 0;
      let keepFrom = pending.length;
      for (let i = pending.length - 1; i >= 0; i--) {
        const segLen = pending[i].length + 1; // +1 for the joining space
        if (keptLen + segLen > overlap) break;
        keptLen += segLen;
        keepFrom = i;
      }

      pending = pending.slice(keepFrom);
      pendingLen = keptLen;
      // The kept segments occupy exactly the `keptLen` characters before
      // `nextOffset`. (Previously this added 1, shifting every chunk after the
      // first one character to the right.)
      pendingStart = nextOffset - keptLen;
    }

    pending.push(segment);
    pendingLen += segment.length + 1;
    nextOffset += segment.length + 1;
  }

  if (pending.length > 0) {
    emitPending();
  }
  return chunks;
}
// ── Factory ────────────────────────────────────────────────────────
/**
 * Build a chunker bound to a fully-resolved configuration.
 *
 * Omitted options default to: strategy 'fixed', chunkSize 1000, overlap 200,
 * separator '\n', keepSeparator true. `separator` and `keepSeparator` are
 * stored on the resulting config but no strategy below reads them.
 *
 * Strategies:
 * - 'sentence' / 'paragraph': split the text, then pack the pieces with
 *   `mergeSegments`.
 * - 'semantic': split into paragraphs; any paragraph longer than `chunkSize`
 *   is further split into sentences before packing.
 * - 'fixed' (also the fallback): sliding character window of `chunkSize`,
 *   each window starting `overlap` characters before the previous window's end.
 *   The window always advances by at least one character, and stops as soon as
 *   a window reaches the end of the text.
 *
 * `chunkSize` is expected to be a positive number; behaviour for non-positive
 * values is unchanged from before (a single degenerate chunk).
 *
 * @param userConfig Partial configuration; missing fields take the defaults above.
 * @returns A chunker whose `chunk()` returns `[]` for empty or whitespace-only text.
 */
export function createChunker(userConfig: ChunkerConfig = {}): Chunker {
  const config: Required<ChunkerConfig> = {
    strategy: userConfig.strategy ?? 'fixed',
    chunkSize: userConfig.chunkSize ?? 1000,
    overlap: userConfig.overlap ?? 200,
    separator: userConfig.separator ?? '\n',
    keepSeparator: userConfig.keepSeparator ?? true,
  };

  return {
    chunk(text: string): Chunk[] {
      if (!text || text.trim().length === 0) return [];

      switch (config.strategy) {
        case 'sentence': {
          const sentences = splitSentences(text);
          return mergeSegments(sentences, config.chunkSize, config.overlap);
        }
        case 'paragraph': {
          const paragraphs = splitParagraphs(text);
          return mergeSegments(paragraphs, config.chunkSize, config.overlap);
        }
        case 'semantic': {
          // Semantic: split by paragraphs first, then by sentences if still too large
          const paras = splitParagraphs(text);
          const segments: string[] = [];
          for (const p of paras) {
            if (p.length <= config.chunkSize) {
              segments.push(p);
            } else {
              segments.push(...splitSentences(p));
            }
          }
          return mergeSegments(segments, config.chunkSize, config.overlap);
        }
        case 'fixed':
        default: {
          const chunks: Chunk[] = [];
          let offset = 0;
          while (offset < text.length) {
            const end = Math.min(offset + config.chunkSize, text.length);
            const piece = text.slice(offset, end);
            chunks.push({
              index: chunks.length,
              text: piece,
              charCount: piece.length,
              startOffset: offset,
              endOffset: end,
            });

            // The window reached the end of the text: stepping back by `overlap`
            // would only re-emit a suffix of the chunk just produced.
            if (end >= text.length) break;

            // Degenerate chunkSize (<= 0 or NaN) produced no forward window; stop.
            if (!(end > offset)) break;

            // Step back by `overlap`, but always advance at least one character so
            // that overlap >= chunkSize cannot stall and drop the rest of the text.
            offset = Math.max(end - config.overlap, offset + 1);
          }
          return chunks;
        }
      }
    },
    get config() {
      return config;
    },
  };
}
// ── Convenience ────────────────────────────────────────────────────
/**
 * One-shot helper: build a chunker from `options` and immediately chunk `text`.
 *
 * @param text Text to split.
 * @param options Optional chunker configuration; omitted fields use the chunker defaults.
 * @returns The chunks produced by the configured strategy.
 */
export function chunkText(text: string, options?: ChunkerConfig): Chunk[] {
  const chunker = createChunker(options);
  return chunker.chunk(text);
}
|