Press n or j to go to the next uncovered block, b, p or k for the previous block.
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | 8x 8x 16x 20x 20x 20x 20x 1491x 1491x 1491x 1491x 20x | import { getEncoding } from 'js-tiktoken';
import { Token } from '../types';
import { TokenizerStrategy } from '../tokenizer.interface';
/**
 * Tokenizer strategy backed by the `cl100k_base` byte-pair encoding from `js-tiktoken`.
 *
 * Every emitted token carries the UTF-16 offsets (`startIndex`/`endIndex`) of the
 * slice of the source string it covers, so consecutive tokens tile the input.
 */
export class TiktokenTokenizer implements TokenizerStrategy {
  // Use cl100k_base (GPT-4) as the standard encoding
  private readonly enc = getEncoding('cl100k_base');

  /**
   * Splits `content` into BPE tokens annotated with source offsets.
   *
   * BPE tokens are byte sequences, so a single multi-byte UTF-8 character (emoji,
   * CJK, ...) may be spread over several token IDs. Decoding such a fragment on its
   * own produces U+FFFD replacement characters whose length does not match the
   * source, which would shift every following offset. To keep offsets exact, token
   * IDs are accumulated until their combined decoding lines up with the source text
   * at the current cursor; that group is then emitted as ONE token. As a result the
   * returned array can be shorter than the raw token-ID sequence.
   *
   * @param content Text to tokenize; an empty string yields an empty array.
   * @returns Tokens of type `'bpe'` in source order. `startPosition` is not
   *   calculated for the Tiktoken strategy as it is computationally expensive and
   *   not strictly required for the patching algorithm, which relies on text matching.
   */
  tokenize(content: string): Token[] {
    const tokens: Token[] = [];
    // Empty allow/deny lists: input that merely looks like a special token
    // (e.g. "<|endoftext|>") is encoded as ordinary text instead of making
    // encode() throw on arbitrary user content.
    const encoded = this.enc.encode(content, [], []);
    let currentIndex = 0;
    // Token IDs whose bytes do not yet add up to text matching the source.
    let pending: number[] = [];

    for (const tokenId of encoded) {
      pending.push(tokenId);
      // Decoding is the only public way to get a token's text; decode the whole
      // pending group so partial byte sequences get a chance to complete.
      const text = this.enc.decode(pending);
      if (!TiktokenTokenizer.alignsWithSource(content, currentIndex, text)) {
        continue;
      }

      const endIndex = currentIndex + text.length;
      tokens.push({
        text,
        type: 'bpe',
        startIndex: currentIndex,
        endIndex,
      });
      currentIndex = endIndex;
      pending = [];
    }

    // Safety net: if the tail never aligned with the source, still emit it so no
    // token is silently dropped, and clamp the span to the end of the input.
    if (pending.length > 0) {
      tokens.push({
        text: this.enc.decode(pending),
        type: 'bpe',
        startIndex: currentIndex,
        endIndex: content.length,
      });
    }

    return tokens;
  }

  /**
   * Reports whether `text` (a decoded token group) matches `content` starting at
   * `start`. A U+FFFD in `text` is accepted only where the source holds an unpaired
   * surrogate, because such a code unit has no UTF-8 form and is expected to come
   * back from the encoder/decoder round trip as U+FFFD.
   */
  private static alignsWithSource(content: string, start: number, text: string): boolean {
    if (text.length === 0 || start + text.length > content.length) {
      return false;
    }
    for (let i = 0; i < text.length; i++) {
      const decodedUnit = text.charCodeAt(i);
      if (decodedUnit === content.charCodeAt(start + i)) {
        continue;
      }
      if (decodedUnit !== 0xfffd || !TiktokenTokenizer.isLoneSurrogate(content, start + i)) {
        return false;
      }
    }
    return true;
  }

  /** Reports whether the UTF-16 code unit at `index` is a surrogate without its partner. */
  private static isLoneSurrogate(content: string, index: number): boolean {
    const unit = content.charCodeAt(index);
    if (unit >= 0xd800 && unit <= 0xdbff) {
      // High surrogate: paired only when a low surrogate follows (NaN at end of string).
      const next = content.charCodeAt(index + 1);
      return !(next >= 0xdc00 && next <= 0xdfff);
    }
    if (unit >= 0xdc00 && unit <= 0xdfff) {
      // Low surrogate: paired only when a high surrogate precedes it.
      const prev = content.charCodeAt(index - 1);
      return !(prev >= 0xd800 && prev <= 0xdbff);
    }
    return false;
  }
}