All files / src/evals index.ts

76.76% Statements 76/99
70.9% Branches 39/55
80.95% Functions 17/21
78.35% Lines 76/97

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291                                                                                            21x 21x         21x       21x 21x         21x 21x       21x 21x 21x 21x                       11x   11x 2x       9x 9x         9x 90x 27x               3x   3x         3x 3x         3x 24x 22x               12x 1x       11x 11x           11x 11x         11x 11x         11x             15x         15x 15x           15x 15x         15x 15x         15x               9x 9x 9x   9x 1x     8x   8x 29x 29x 29x 1x   28x     29x                                             14x 14x 14x   14x       14x 14x 14x 1x   13x     14x                                     2x 2x   2x 3x 3x 3x     2x             2x   12x 12x      
import { readFileSync, readdirSync, existsSync } from 'node:fs';
import { join, basename, dirname } from 'node:path';
import { parse as parseYaml } from 'yaml';
import {
  DEFAULT_EVAL_MODEL,
  DEFAULT_EVAL_RUNTIME,
  EvalFileSchema,
  EvalScenarioFileSchema,
} from './types.js';
import type { EvalFile, EvalMeta, EvalScenarioFile } from './types.js';
import type { RuntimeName } from '../sdk/runtimes/types.js';
 
export type { EvalMeta };
 
/**
 * Options shared by every standalone JSON scenario of one category.
 */
export interface EvalScenarioSetOptions {
  /** Category/suite name; also the directory under evals/ that is scanned for JSON scenarios. */
  category: string;
  /** Skill to run, as a path relative to the evals directory. */
  skill: string;
  /** Runtime used by scenarios in this set that do not specify their own. */
  runtime?: RuntimeName;
  /** Model used by scenarios in this set that do not specify their own. */
  model?: string;
  /** Evals directory to use instead of the default one (intended for tests). */
  baseDir?: string;
}
 
/**
 * Location of the default evals directory: the `evals` folder two levels
 * above the directory that contains this module.
 */
function getEvalsDir(): string {
  const moduleDir = import.meta.dirname;
  return join(moduleDir, '..', '..', 'evals');
}
 
/**
 * Derive a skill name from the path alone.
 *
 * A file literally named `SKILL.md` is named after its parent directory;
 * any other file is named after itself with the last extension removed.
 */
function fallbackSkillName(skillPath: string): string {
  const leaf = basename(skillPath);
  if (leaf === 'SKILL.md') {
    return basename(dirname(skillPath));
  }
  // Strip only the final ".ext" segment, so "a.tar.gz" becomes "a.tar".
  return leaf.replace(/\.[^.]+$/, '');
}
 
/**
 * Resolve the skill name used in eval output from skill frontmatter.
 *
 * The file is expected to start with a `---` delimited YAML frontmatter block
 * containing a non-blank string `name`. Whenever that is not the case (file
 * unreadable, no frontmatter, unterminated frontmatter, invalid YAML, missing
 * or non-string name) the name is derived from the path instead.
 *
 * @param skillPath - Path to the skill file.
 * @returns The trimmed frontmatter `name`, or the path-derived fallback.
 */
export function resolveEvalSkillName(skillPath: string): string {
  let content: string;
  try {
    content = readFileSync(skillPath, 'utf-8');
  } catch {
    return fallbackSkillName(skillPath);
  }

  // readFileSync does not strip a UTF-8 byte order mark; drop it so a
  // BOM-prefixed file is still recognised as starting with frontmatter.
  if (content.charCodeAt(0) === 0xfeff) {
    content = content.slice(1);
  }

  if (!content.startsWith('---')) {
    return fallbackSkillName(skillPath);
  }

  // Search for the closing delimiter after the opening "---".
  const end = content.indexOf('\n---', 3);
  if (end === -1) {
    return fallbackSkillName(skillPath);
  }

  let parsed: unknown;
  try {
    parsed = parseYaml(content.slice(3, end));
  } catch {
    return fallbackSkillName(skillPath);
  }

  if (parsed && typeof parsed === 'object' && 'name' in parsed) {
    const name = (parsed as { name?: unknown }).name;
    if (typeof name === 'string' && name.trim()) {
      return name.trim();
    }
  }

  return fallbackSkillName(skillPath);
}
 
/**
 * List the YAML eval files directly inside the evals directory.
 *
 * Entries whose name ends in `.yaml` or `.yml` are returned as paths joined
 * onto the evals directory, in default sort order. A missing or unreadable
 * directory yields an empty list.
 *
 * @param baseDir - Directory to scan instead of the default evals directory.
 */
export function discoverEvalFiles(baseDir?: string): string[] {
  const root = baseDir ?? getEvalsDir();
  if (!existsSync(root)) {
    return [];
  }

  let names: string[];
  try {
    names = readdirSync(root);
  } catch {
    return [];
  }

  const found: string[] = [];
  for (const name of names) {
    const isYaml = name.endsWith('.yaml') || name.endsWith('.yml');
    if (isYaml) {
      found.push(join(root, name));
    }
  }
  found.sort();
  return found;
}
 
/**
 * Discover standalone JSON scenario files in evals/<category>/.
 *
 * @param category - Sub-directory of the evals directory to scan.
 * @param baseDir - Evals directory to use instead of the default one.
 * @returns Paths of the entries ending in `.json`, joined onto the category
 *   directory and sorted; empty when the directory is missing or unreadable.
 */
export function discoverEvalScenarioFiles(category: string, baseDir?: string): string[] {
  const scenarioDir = join(baseDir ?? getEvalsDir(), category);

  if (!existsSync(scenarioDir)) {
    return [];
  }

  let entries: string[];
  try {
    entries = readdirSync(scenarioDir);
  } catch {
    // Exists but cannot be listed (e.g. it is a file): treat as no scenarios.
    return [];
  }

  return entries
    .filter((entry) => entry.endsWith('.json'))
    .map((entry) => join(scenarioDir, entry))
    .sort();
}
 
/**
 * Load and validate a YAML eval file.
 *
 * @param filePath - Path to the YAML file.
 * @returns The parsed content as validated by `EvalFileSchema`.
 * @throws Error when the file does not exist, cannot be read, is not valid
 *   YAML, or does not satisfy the schema. Read and parse failures keep the
 *   underlying error as `cause`.
 */
export function loadEvalFile(filePath: string): EvalFile {
  if (!existsSync(filePath)) {
    throw new Error(`Eval file not found: ${filePath}`);
  }

  let content: string;
  try {
    content = readFileSync(filePath, 'utf-8');
  } catch (error) {
    throw new Error(`Failed to read ${filePath}: ${error}`, { cause: error });
  }

  let parsed: unknown;
  try {
    parsed = parseYaml(content);
  } catch (error) {
    // Keep the parser error attached, as the read failure above does.
    throw new Error(`Failed to parse YAML in ${filePath}: ${error}`, { cause: error });
  }

  const validated = EvalFileSchema.safeParse(parsed);
  if (!validated.success) {
    // Flatten every schema issue into "path: message" for a one-line report.
    const issues = validated.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join(', ');
    throw new Error(`Invalid eval file ${filePath}: ${issues}`);
  }

  return validated.data;
}
 
/**
 * Load and validate a standalone JSON eval scenario.
 *
 * @param filePath - Path to the JSON scenario file.
 * @returns The parsed content as validated by `EvalScenarioFileSchema`.
 * @throws Error when the file does not exist, cannot be read, is not valid
 *   JSON, or does not satisfy the schema. Read and parse failures keep the
 *   underlying error as `cause`.
 */
export function loadEvalScenarioFile(filePath: string): EvalScenarioFile {
  if (!existsSync(filePath)) {
    throw new Error(`Eval scenario file not found: ${filePath}`);
  }

  let content: string;
  try {
    content = readFileSync(filePath, 'utf-8');
  } catch (error) {
    throw new Error(`Failed to read ${filePath}: ${error}`, { cause: error });
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch (error) {
    // Keep the SyntaxError attached, as the read failure above does.
    throw new Error(`Failed to parse JSON in ${filePath}: ${error}`, { cause: error });
  }

  const validated = EvalScenarioFileSchema.safeParse(parsed);
  if (!validated.success) {
    // Flatten every schema issue into "path: message" for a one-line report.
    const issues = validated.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join(', ');
    throw new Error(`Invalid eval scenario file ${filePath}: ${issues}`);
  }

  return validated.data;
}
 
/**
 * Expand a loaded YAML eval file into one executable EvalMeta per scenario.
 *
 * The skill and every fixture path are resolved against the directory that
 * contains the YAML file; the category is the YAML file name without its
 * `.yaml`/`.yml` extension. Scenario-level `model`/`runtime` take precedence
 * over the file-level values.
 *
 * @param evalFile - Validated content of the YAML file.
 * @param yamlPath - Path the content was loaded from.
 * @throws Error when the skill or any fixture does not exist on disk.
 */
export function resolveEvalMetas(evalFile: EvalFile, yamlPath: string): EvalMeta[] {
  const yamlDir = join(yamlPath, '..');
  const category = basename(yamlPath).replace(/\.ya?ml$/, '');
  const skillPath = join(yamlDir, evalFile.skill);

  if (!existsSync(skillPath)) {
    throw new Error(`Eval skill not found in ${yamlPath}: ${evalFile.skill}`);
  }

  // Resolved once: every scenario of the file shares the same skill.
  const skillName = resolveEvalSkillName(skillPath);

  const metas: EvalMeta[] = [];
  for (const scenario of evalFile.evals) {
    const filePaths: string[] = [];
    for (const file of scenario.files) {
      const fixturePath = join(yamlDir, file);
      if (!existsSync(fixturePath)) {
        throw new Error(`Eval fixture not found for ${category}/${scenario.name}: ${file}`);
      }
      filePaths.push(fixturePath);
    }

    metas.push({
      name: scenario.name,
      category,
      skillName,
      given: scenario.given,
      skillPath,
      filePaths,
      model: scenario.model ?? evalFile.model,
      runtime: scenario.runtime ?? evalFile.runtime,
      should_find: scenario.should_find,
      should_not_find: scenario.should_not_find,
    });
  }

  return metas;
}
 
/**
 * Resolve one standalone JSON scenario into executable EvalMeta.
 *
 * The skill and fixture paths are resolved against `options.baseDir`, or the
 * default evals directory when it is not given. The scenario name defaults to
 * the file name without `.json`. Model and runtime are taken from the
 * scenario, then from `options`, then from the module defaults.
 *
 * @param scenario - Validated scenario content.
 * @param scenarioPath - Path the scenario was loaded from.
 * @param options - Settings shared by the scenario set.
 * @throws Error when the skill or any fixture does not exist on disk.
 */
export function resolveEvalScenarioMeta(
  scenario: EvalScenarioFile,
  scenarioPath: string,
  options: EvalScenarioSetOptions
): EvalMeta {
  const evalsDir = options.baseDir ?? getEvalsDir();
  const name = scenario.name ?? basename(scenarioPath).replace(/\.json$/, '');
  const skillPath = join(evalsDir, options.skill);

  if (!existsSync(skillPath)) {
    throw new Error(`Eval skill not found for ${options.category}/${name}: ${options.skill}`);
  }

  const filePaths = scenario.files.map((file) => {
    const filePath = join(evalsDir, file);
    if (!existsSync(filePath)) {
      throw new Error(`Eval fixture not found for ${options.category}/${name}: ${file}`);
    }
    return filePath;
  });

  return {
    name,
    category: options.category,
    skillName: resolveEvalSkillName(skillPath),
    given: scenario.given,
    skillPath,
    filePaths,
    model: scenario.model ?? options.model ?? DEFAULT_EVAL_MODEL,
    runtime: scenario.runtime ?? options.runtime ?? DEFAULT_EVAL_RUNTIME,
    should_find: scenario.should_find,
    should_not_find: scenario.should_not_find,
  };
}
 
/**
 * Find every YAML eval file, load it, and expand it into EvalMeta objects.
 *
 * @param baseDir - Directory to scan instead of the default evals directory.
 * @returns All scenarios of all files as one flat list, in file order.
 */
export function discoverEvals(baseDir?: string): EvalMeta[] {
  return discoverEvalFiles(baseDir).flatMap((yamlPath) =>
    resolveEvalMetas(loadEvalFile(yamlPath), yamlPath)
  );
}
 
/**
 * Find, load, and resolve every standalone JSON scenario of one category.
 *
 * @param options - Category, skill, and defaults shared by the scenario set.
 * @returns One EvalMeta per scenario file, in the order the files were found.
 */
export function discoverEvalScenarios(options: EvalScenarioSetOptions): EvalMeta[] {
  const scenarioPaths = discoverEvalScenarioFiles(options.category, options.baseDir);
  const metas: EvalMeta[] = [];
  for (const scenarioPath of scenarioPaths) {
    const scenario = loadEvalScenarioFile(scenarioPath);
    metas.push(resolveEvalScenarioMeta(scenario, scenarioPath, options));
  }
  return metas;
}