/**
* AgentKits — Evaluation Module
*
* Compare outputs from different models/providers.
* Metrics: latency, cost, output quality (via judge model).
*
* Usage:
* import { evaluate, batchEvaluate } from 'agentkits/evaluation';
*/
import { createChat, type LLMProvider, type ChatConfig } from '../llm/index.js';
// ── Types ──────────────────────────────────────────────────────────
export interface EvalTarget {
  name: string;
  provider: LLMProvider;
  model?: string;
  apiKey?: string;
  baseUrl?: string;
}
export interface EvalCase {
  prompt: string;
  system?: string;
  /** Expected output for reference (optional) */
  expected?: string;
}
export interface EvalMetrics {
  latencyMs: number;
  outputLength: number;
  estimatedTokens: number;
  /** 1-10 quality score from judge model (if configured) */
  qualityScore?: number;
  qualityReason?: string;
}
export interface EvalResult {
  target: string;
  output: string;
  metrics: EvalMetrics;
  error?: string;
}
export interface EvalSummary {
  case: EvalCase;
  results: EvalResult[];
  winner?: string;
}
export interface EvalConfig {
  targets: EvalTarget[];
  /** Judge model for quality scoring */
  judge?: ChatConfig;
  /** Timeout per request in ms (default: 30000) */
  timeout?: number;
}
// ── Helpers ─────────────────────────────────────────────────────────
/** Rough estimate: ~4 characters per token, a common heuristic for English text. */
function estimateTokens(text: string): number {
  return Math.ceil(text.length / 4);
}
async function judgeQuality(
  judgeConfig: ChatConfig,
  prompt: string,
  output: string,
  expected?: string,
): Promise<{ score: number; reason: string }> {
  const judge = createChat(judgeConfig);
  const judgePrompt = `Rate the following LLM output on a scale of 1-10.
Prompt: ${prompt}
${expected ? `Expected: ${expected}\n` : ''}
Output: ${output}
Respond in JSON: {"score": <1-10>, "reason": "<brief reason>"}`;
  try {
    const response = await judge.complete(judgePrompt, { temperature: 0 });
    // The judge may wrap its answer in prose, so extract the first {...} block before parsing.
    const parsed = JSON.parse(response.match(/\{[\s\S]*\}/)?.[0] ?? '{}');
    return { score: parsed.score ?? 5, reason: parsed.reason ?? '' };
  } catch {
    // A judge failure should not abort the evaluation run; score 0 flags it.
    return { score: 0, reason: 'Judge failed' };
  }
}
// ── Public API ─────────────────────────────────────────────────────
/** Evaluate a single prompt across all targets */
export async function evaluate(config: EvalConfig, evalCase: EvalCase): Promise<EvalSummary> {
  const results: EvalResult[] = [];
  const timeout = config.timeout ?? 30000;
  for (const target of config.targets) {
    const start = Date.now();
    try {
      const chat = createChat({
        provider: target.provider,
        model: target.model,
        apiKey: target.apiKey,
        baseUrl: target.baseUrl,
      });
      const output = await Promise.race([
        chat.complete(evalCase.prompt, { system: evalCase.system }),
        new Promise<never>((_, reject) => setTimeout(() => reject(new Error('Timeout')), timeout)),
      ]);
      const latencyMs = Date.now() - start;
      const metrics: EvalMetrics = {
        latencyMs,
        outputLength: output.length,
        estimatedTokens: estimateTokens(output),
      };
      if (config.judge) {
        const { score, reason } = await judgeQuality(config.judge, evalCase.prompt, output, evalCase.expected);
        metrics.qualityScore = score;
        metrics.qualityReason = reason;
      }
      results.push({ target: target.name, output, metrics });
    } catch (err: any) {
      results.push({
        target: target.name,
        output: '',
        metrics: { latencyMs: Date.now() - start, outputLength: 0, estimatedTokens: 0 },
        error: err.message ?? String(err),
      });
    }
  }
  // Determine winner (by quality score if available, else by latency)
  const successful = results.filter(r => !r.error);
  let winner: string | undefined;
  if (successful.length > 0) {
    if (successful[0].metrics.qualityScore !== undefined) {
      winner = successful.sort((a, b) => (b.metrics.qualityScore ?? 0) - (a.metrics.qualityScore ?? 0))[0].target;
    } else {
      winner = successful.sort((a, b) => a.metrics.latencyMs - b.metrics.latencyMs)[0].target;
    }
  }
  return { case: evalCase, results, winner };
}
/** Batch evaluate multiple cases */
export async function batchEvaluate(
  config: EvalConfig,
  cases: EvalCase[],
): Promise<EvalSummary[]> {
  const summaries: EvalSummary[] = [];
  for (const c of cases) {
    summaries.push(await evaluate(config, c));
  }
  return summaries;
}
/** Format evaluation results as a CLI-friendly table string */
export function formatEvalResults(summary: EvalSummary): string {
  const lines: string[] = [];
  lines.push(`Prompt: ${summary.case.prompt.slice(0, 80)}${summary.case.prompt.length > 80 ? '...' : ''}`);
  lines.push('─'.repeat(80));
  lines.push(`${'Target'.padEnd(20)} ${'Latency'.padEnd(10)} ${'Tokens'.padEnd(8)} ${'Quality'.padEnd(8)} ${'Error'.padEnd(20)}`);
  lines.push('─'.repeat(80));
  for (const r of summary.results) {
    lines.push(
      `${r.target.padEnd(20)} ${(r.metrics.latencyMs + 'ms').padEnd(10)} ${String(r.metrics.estimatedTokens).padEnd(8)} ${(r.metrics.qualityScore?.toString() ?? '-').padEnd(8)} ${(r.error ?? '').padEnd(20)}`
    );
  }
  if (summary.winner) lines.push(`\n🏆 Winner: ${summary.winner}`);
  return lines.join('\n');
}
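// ── Example usage ───────────────────────────────────────────────────
// Illustrative sketch only: the provider names, model identifiers, and
// judge config below are assumptions, not values defined by this module.
// Adjust them to whatever providers ../llm actually registers.
//
//   import { evaluate, formatEvalResults } from 'agentkits/evaluation';
//
//   const config = {
//     targets: [
//       { name: 'gpt', provider: 'openai', model: 'gpt-4o-mini' },
//       { name: 'claude', provider: 'anthropic', model: 'claude-3-5-haiku-latest' },
//     ],
//     judge: { provider: 'openai', model: 'gpt-4o' },   // optional: enables quality scoring
//     timeout: 20_000,
//   };
//
//   const summary = await evaluate(config, { prompt: 'Summarize Hamlet in two sentences.' });
//   console.log(formatEvalResults(summary));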