All files / src/token-counter index.ts

100% Statements 50/50
93.75% Branches 15/16
100% Functions 7/7
100% Lines 50/50

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108                                                                  1x 1x 1x 1x 1x 1x 1x 1x     1x 1x 1x 1x 1x 1x     1x 1x       30x 30x   27x 27x       1x 13x 13x   13x 13x 5x 5x   13x 6x 6x 6x 6x 6x 6x 6x 6x   13x 4x 4x 4x 13x 13x             1x 9x 9x 9x 9x         1x 7x 7x  
/**
 * AgentKits — Token Counter Module
 *
 * Estimates token counts for text using a characters-per-token heuristic
 * (an approximation of tiktoken-style tokenizers, not an exact tokenizer).
 * Each supported tokenizer family has its own ratio.
 *
 * Usage:
 *   import { countTokens, createTokenCounter } from 'agentkits/token-counter';
 *   const count = countTokens('Hello world', 'gpt-4o');
 */
 
// ── Types ──────────────────────────────────────────────────────────
 
/**
 * Tokenizer families this module can approximate.
 * `'default'` is the fallback family used when no family is given or a model
 * name is not recognized.
 */
export type TokenizerModel = 'cl100k' | 'p50k' | 'o200k' | 'default';
 
/**
 * Options accepted by `createTokenCounter`. Every field is optional.
 */
export interface TokenCounterConfig {
  /** Default tokenizer model to use (`'default'` when omitted) */
  model?: TokenizerModel;
  /**
   * Characters-per-token ratio override (default depends on model).
   * When set, it takes precedence over the ratio associated with `model`.
   */
  charsPerToken?: number;
}
 
/**
 * Token estimation API returned by `createTokenCounter`.
 * All counts are heuristic estimates, not exact tokenizer output.
 */
export interface TokenCounter {
  /** Count tokens in text */
  count(text: string): number;
  /** Count tokens for chat messages (role and content of every message are counted) */
  countMessages(messages: Array<{ role: string; content: string }>): number;
  /** Estimate cost given price per 1M tokens; the result is in the same currency unit as the price */
  estimateCost(text: string, pricePerMillion: number): number;
}
 
// ── Model-to-tokenizer mapping ─────────────────────────────────────
 
// Known model names and the tokenizer family each one is approximated with.
// Lookups in this module treat any name not listed here as 'default'.
const MODEL_TOKENIZER_MAP: Record<string, TokenizerModel> = {
  'gpt-4o': 'o200k',
  'gpt-4o-mini': 'o200k',
  'gpt-4-turbo': 'cl100k',
  'gpt-4': 'cl100k',
  'gpt-3.5-turbo': 'cl100k',
  'text-davinci-003': 'p50k',
};
 
// Average characters per token for each tokenizer family.
// A text's length is divided by this ratio and rounded up to get the estimate.
const CHARS_PER_TOKEN: Record<TokenizerModel, number> = {
  cl100k: 4.0,
  p50k: 4.0,
  o200k: 3.8,
  default: 4.0,
};
 
// Overhead tokens per message for chat format.
// MESSAGE_OVERHEAD is added once for every message; CHAT_OVERHEAD is added
// once per countMessages() call, regardless of how many messages there are.
const MESSAGE_OVERHEAD = 4; // <|im_start|>role\ncontent<|im_end|>
const CHAT_OVERHEAD = 3; // every reply is primed with <|im_start|>assistant
 
// ── Heuristic tokenizer ────────────────────────────────────────────
 
/**
 * Approximate the number of tokens in `text` from its length alone.
 *
 * The estimate is `text.length / charsPerToken`, rounded up to the next
 * integer. No per-character weighting (whitespace, punctuation, ...) is
 * applied; only the UTF-16 length of the string is considered.
 *
 * @param text - Text to measure; any falsy value (e.g. the empty string) counts as 0 tokens.
 * @param charsPerToken - Average number of characters that make up one token.
 * @returns The estimated token count.
 */
function estimateTokens(text: string, charsPerToken: number): number {
  if (text) {
    const ratio = text.length / charsPerToken;
    // Round up: a partial token still occupies a whole token.
    return Math.ceil(ratio);
  }
  return 0;
}
 
// ── Factory ────────────────────────────────────────────────────────
 
/**
 * Build a {@link TokenCounter} bound to a fixed characters-per-token ratio.
 *
 * The ratio is `config.charsPerToken` when provided; otherwise it is the ratio
 * registered for `config.model`, falling back to the `default` family.
 *
 * @param config - Optional tokenizer family and ratio override.
 * @returns A counter whose methods all use the resolved ratio.
 * @throws RangeError if the resolved ratio is not a finite number greater than
 *   zero (such a ratio would produce Infinity, NaN or negative token counts).
 */
export function createTokenCounter(config: TokenCounterConfig = {}): TokenCounter {
  const model = config.model ?? 'default';

  // Own-property lookup: an untyped caller passing e.g. 'constructor' must get
  // the default ratio, not a member inherited from Object.prototype.
  const modelRatio = Object.prototype.hasOwnProperty.call(CHARS_PER_TOKEN, model)
    ? CHARS_PER_TOKEN[model]
    : CHARS_PER_TOKEN.default;
  const charsPerToken = config.charsPerToken ?? modelRatio;

  // Fail fast at construction time instead of returning nonsensical counts later.
  if (!Number.isFinite(charsPerToken) || charsPerToken <= 0) {
    throw new RangeError(
      `charsPerToken must be a finite number greater than 0, received ${String(charsPerToken)}`,
    );
  }

  return {
    count(text: string): number {
      return estimateTokens(text, charsPerToken);
    },

    countMessages(messages: Array<{ role: string; content: string }>): number {
      // Start from the fixed reply-priming overhead, then add each message's
      // framing overhead plus its role and content estimates.
      let total = CHAT_OVERHEAD;
      for (const { role, content } of messages) {
        total +=
          MESSAGE_OVERHEAD +
          estimateTokens(role, charsPerToken) +
          estimateTokens(content, charsPerToken);
      }
      return total;
    },

    estimateCost(text: string, pricePerMillion: number): number {
      const tokens = estimateTokens(text, charsPerToken);
      // The price is quoted per one million tokens.
      return (tokens / 1_000_000) * pricePerMillion;
    },
  };
}
 
// ── Convenience ────────────────────────────────────────────────────
 
/**
 * Quick token count for a string. Optionally specify a model name for better accuracy.
 *
 * @param text - Text to measure.
 * @param model - Model name such as `'gpt-4o'`. Omitted, empty or unrecognized
 *   names use the `default` tokenizer family.
 * @returns The estimated token count.
 */
export function countTokens(text: string, model?: string): number {
  // Only own keys of the map count as known models. A plain index lookup would
  // resolve names like 'constructor' or '__proto__' to Object.prototype
  // members, which are not nullish and would make the result NaN.
  const isKnownModel =
    typeof model === 'string' &&
    Object.prototype.hasOwnProperty.call(MODEL_TOKENIZER_MAP, model);
  const tokenizer: TokenizerModel =
    isKnownModel && model ? (MODEL_TOKENIZER_MAP[model] ?? 'default') : 'default';
  const charsPerToken = CHARS_PER_TOKEN[tokenizer];
  return estimateTokens(text, charsPerToken);
}
 
/**
 * Resolve which tokenizer family a model uses.
 *
 * @param model - Model name such as `'gpt-4o'`.
 * @returns The family registered for `model`, or `'default'` when the name is
 *   not a known model.
 */
export function getTokenizerForModel(model: string): TokenizerModel {
  // Check own keys only: a plain index lookup would return inherited
  // Object.prototype members (e.g. for 'constructor' or 'toString') instead of
  // falling back to 'default'.
  if (!Object.prototype.hasOwnProperty.call(MODEL_TOKENIZER_MAP, model)) {
    return 'default';
  }
  return MODEL_TOKENIZER_MAP[model] ?? 'default';
}