All files / packages/tokenpatch/strategies tiktoken-tokenizer.ts

100% Statements 12/12
100% Branches 0/0
100% Functions 2/2
100% Lines 12/12

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 358x       8x   16x     20x 20x   20x       20x     1491x 1491x   1491x               1491x   20x    
import { getEncoding } from 'js-tiktoken';
import { Token } from '../types';
import { TokenizerStrategy } from '../tokenizer.interface';
 
/**
 * Tokenizer strategy backed by js-tiktoken's `cl100k_base` BPE encoding.
 *
 * Every emitted token carries `type: 'bpe'` and UTF-16 offsets into the
 * original string such that `content.slice(startIndex, endIndex) === text`.
 */
export class TiktokenTokenizer implements TokenizerStrategy {
  /**
   * Matches UTF-16 surrogates that are not part of a valid pair. UTF-8 encoding
   * turns each of them into U+FFFD, so the same substitution is applied to the
   * source before it is compared with decoded token text.
   */
  private static readonly LONE_SURROGATE =
    /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g;

  // Use cl100k_base (GPT-4) as the standard encoding
  private readonly enc = getEncoding('cl100k_base');

  /**
   * Splits `content` into BPE tokens with source offsets.
   *
   * BPE tokens are byte sequences, so a single character (accented letter, CJK,
   * emoji, ...) may be spread over several token IDs. Decoding such an ID on its
   * own yields U+FFFD replacement characters whose length does not match the
   * source, which would corrupt `text` and shift every following offset.
   * Token IDs are therefore buffered until their combined decoding lines up
   * with the source at the current offset; a returned token may consequently
   * cover more than one token ID.
   *
   * @param content Text to tokenize.
   * @returns Tokens in source order. `startPosition` is not populated: it is
   *   expensive to compute and the patching algorithm relies on text matching.
   */
  tokenize(content: string): Token[] {
    const tokens: Token[] = [];

    // Empty allowed/disallowed lists: special-token literals such as
    // "<|endoftext|>" are ordinary text here and must not make encode() throw.
    const encoded = this.enc.encode(content, [], []);

    // Same length as `content` (one code unit per replaced surrogate), so
    // offsets computed against it are valid for `content` as well.
    const comparable = content.replace(TiktokenTokenizer.LONE_SURROGATE, '\uFFFD');

    let currentIndex = 0;
    let pending: number[] = [];

    for (const tokenId of encoded) {
      pending.push(tokenId);
      const decoded = this.enc.decode(pending);

      // A mismatch means the buffered IDs end in the middle of a character;
      // keep buffering until the remaining bytes arrive.
      if (!comparable.startsWith(decoded, currentIndex)) {
        continue;
      }

      const endIndex = currentIndex + decoded.length;
      tokens.push({
        // Slice the source rather than using `decoded` so the text is exactly
        // what sits between the offsets.
        text: content.slice(currentIndex, endIndex),
        type: 'bpe',
        startIndex: currentIndex,
        endIndex,
      });
      currentIndex = endIndex;
      pending = [];
    }

    // Safety net: if the buffered IDs never lined up with the source, emit the
    // rest of the input as one token so the offsets still cover everything.
    if (pending.length > 0 && currentIndex < content.length) {
      tokens.push({
        text: content.slice(currentIndex),
        type: 'bpe',
        startIndex: currentIndex,
        endIndex: content.length,
      });
    }

    return tokens;
  }
}