All files / src/evals runner.ts

38.46% Statements 25/65
3.84% Branches 1/26
25% Functions 2/8
38.09% Lines 24/63

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194                                                                                            1x   1x 8x   1x 1x 1x           1x 1x 1x 1x   1x   1x 1x         1x 1x 1x     1x 1x 1x 1x     1x 1x   1x 1x                                                                                                                                                                                                                    
import { basename, join, dirname } from 'node:path';
import { copyFileSync, cpSync, mkdirSync, mkdtempSync, rmSync, existsSync } from 'node:fs';
import { tmpdir } from 'node:os';
import { execGitNonInteractive } from '../utils/exec.js';
import { buildLocalEventContext } from '../cli/context.js';
import { resolveSkillAsync } from '../skills/loader.js';
import { runSkill } from '../sdk/runner.js';
import { formatEvalId } from './names.js';
import type { EvalMeta } from './types.js';
import type { Finding, SkillReport } from '../types/index.js';
import type { FindingProcessingEvent } from '../sdk/runner.js';
import type { RuntimeName } from '../sdk/runtimes/types.js';
 
/**
 * Caller-supplied options for a single eval run (see `runEvalSkill`).
 */
export interface RunEvalOptions {
  /** Anthropic API key; forwarded unchanged to `runSkill`. */
  apiKey: string;
  /** Override the model from the YAML spec; when omitted, `meta.model` is used. */
  model?: string;
  /** Override the runtime from the YAML spec; when omitted, `meta.runtime` is used. */
  runtime?: RuntimeName;
  /**
   * Enable verbose logging: log lines are echoed to the console, the flag is
   * forwarded to `runSkill`, and finding-processing events are recorded.
   */
  verbose?: boolean;
}
 
/**
 * Outcome of running one eval scenario through the skill pipeline.
 */
export interface EvalSkillRunResult {
  /** Display name (e.g. "code-review/case"), as produced by `formatEvalId`. */
  name: string;
  /** Eval metadata the run was started with (returned as-is). */
  meta: EvalMeta;
  /** Skill report returned by `runSkill` for this scenario. */
  report: SkillReport;
  /**
   * Log lines collected during the run, each prefixed with the elapsed time
   * (`[<n>ms] ...`). Collected whether or not `verbose` was set.
   */
  logs: string[];
  /**
   * Wall-clock duration in ms, measured from the start of `runEvalSkill`
   * (so it includes temp-repo setup, not just the agent run).
   */
  durationMs: number;
}
 
/**
 * Set up a temporary git repository for an eval scenario.
 *
 * Creates a real git repo with a `main` commit containing the skill definition
 * and an `eval` branch containing only fixture files. This gives
 * the agent a real git environment to explore with Read/Grep and produces
 * real git diffs for the pipeline to parse.
 *
 * @param meta - Eval metadata; `name`, `skillPath` and `filePaths` are read.
 * @param log - Receives a single progress message once the repo is ready.
 * @returns Path of the created repository directory. The caller is
 *   responsible for removing it when done.
 * @throws Rethrows any filesystem or git error, after removing the partially
 *   built temp directory.
 */
export function setupEvalRepo(meta: EvalMeta, log: (msg: string) => void): string {
  // The mkdtemp prefix becomes part of a single directory name, so a path
  // separator in the eval name would point at a non-existent parent directory.
  const safeName = meta.name.replace(/[\\/]+/g, '_');
  const tmpDir = mkdtempSync(join(tmpdir(), `warden-eval-${safeName}-`));

  try {
    const git = (args: string[]) => execGitNonInteractive(args, { cwd: tmpDir });

    git(['init', '--initial-branch=main']);
    git(['config', 'user.email', 'eval@warden.dev']);
    git(['config', 'user.name', 'Warden Eval']);

    // Copy skill into repo. If it lives in a directory (skill-name/SKILL.md),
    // copy the whole directory to preserve resource subdirs (scripts/, references/).
    // For flat .md files, just copy the single file. Commit it on main so eval
    // diffs contain only fixture code, not the skill used to run the eval.
    const skillSrcDir = dirname(meta.skillPath);
    const skillMarker = join(skillSrcDir, 'SKILL.md');
    const skillDestDir = join(tmpDir, '.warden', 'skills');
    mkdirSync(skillDestDir, { recursive: true });

    if (existsSync(skillMarker)) {
      // Directory-format skill: copy entire directory to preserve resources
      const skillDirName = basename(skillSrcDir);
      cpSync(skillSrcDir, join(skillDestDir, skillDirName), { recursive: true });
    } else {
      // Flat skill file: copy just the file, keeping its original file name
      copyFileSync(meta.skillPath, join(skillDestDir, basename(meta.skillPath)));
    }

    git(['add', '.']);
    git(['commit', '-m', 'install eval skill']);
    git(['checkout', '-b', 'eval']);

    // Copy fixture files, preserving their parent directory name
    for (const srcPath of meta.filePaths) {
      const destDir = join(tmpDir, basename(dirname(srcPath)));
      mkdirSync(destDir, { recursive: true });
      copyFileSync(srcPath, join(destDir, basename(srcPath)));
    }

    git(['add', '.']);
    git(['commit', '-m', `add fixture: ${meta.name}`]);

    log(`Repo ready: ${tmpDir} (${meta.filePaths.length} file(s))`);
    return tmpDir;
  } catch (error) {
    // Clean up on partial failure so we don't leak temp directories
    rmSync(tmpDir, { recursive: true, force: true });
    throw error;
  }
}
 
/**
 * Render a finding as `<path>:<startLine> "<title>"`, using `general` in
 * place of the location when the finding has none.
 */
function formatFinding(finding: Finding): string {
  const { location, title } = finding;
  let where = 'general';
  if (location) {
    where = `${location.path}:${location.startLine}`;
  }
  return `${where} "${title}"`;
}
 
/**
 * Render a finding-processing event as a single log line:
 * `Finding <stage>:<action> <finding>`, followed by ` -> <replacement>` when
 * the event carries a replacement and ` (<reason>)` when it carries a reason.
 */
function formatFindingProcessingEvent(event: FindingProcessingEvent): string {
  const segments = [`Finding ${event.stage}:${event.action} ${formatFinding(event.finding)}`];
  if (event.replacement) {
    segments.push(` -> ${formatFinding(event.replacement)}`);
  }
  if (event.reason) {
    segments.push(` (${event.reason})`);
  }
  return segments.join('');
}
 
/**
 * Run a single eval scenario through Warden's skill pipeline.
 *
 * The only thing mocked is the GitHub event payload (no real PR).
 * Everything else runs for real: git repo, diff parsing, SDK invocation,
 * agent with Read/Grep tools, and finding extraction.
 *
 * @param meta - Eval metadata describing the skill and its fixture files.
 * @param options - API key plus optional model/runtime overrides and verbosity.
 * @returns The skill report together with the collected log lines and the
 *   total wall-clock duration of this call.
 * @throws Error when `meta.filePaths` is empty. Errors raised while setting up
 *   the repo or running the skill propagate to the caller; the temporary
 *   repository is removed in either case.
 */
export async function runEvalSkill(
  meta: EvalMeta,
  options: RunEvalOptions
): Promise<EvalSkillRunResult> {
  const startTime = Date.now();
  const name = formatEvalId(meta);
  const logs: string[] = [];

  // Every message is recorded with its elapsed time; console echo is opt-in.
  const log = (msg: string): void => {
    logs.push(`[${Date.now() - startTime}ms] ${msg}`);
    if (options.verbose) {
      console.log(`  [eval:${name}] ${msg}`);
    }
  };

  if (meta.filePaths.length === 0) {
    throw new Error(`No fixture files specified for eval: ${name}`);
  }
  // Use path.basename rather than splitting on '/' so the file name is
  // extracted with the platform's path rules, like the rest of this module.
  log(`Fixture file(s): ${meta.filePaths.map((f) => basename(f)).join(', ')}`);

  let repoDir: string | undefined;

  try {
    repoDir = setupEvalRepo(meta, log);

    const context = buildLocalEventContext({
      base: 'main',
      head: 'eval',
      cwd: repoDir,
      defaultBranch: 'main',
    });
    log(`Context built: ${context.pullRequest?.files.length ?? 0} file(s) from git diff`);

    // Resolve skill from where setupEvalRepo placed it: directory-format
    // skills live under their directory name, flat skills under their file name.
    const skillSrcDir = dirname(meta.skillPath);
    const isDirectorySkill = existsSync(join(skillSrcDir, 'SKILL.md'));
    const skillPath = isDirectorySkill
      ? join(repoDir, '.warden', 'skills', basename(skillSrcDir))
      : join(repoDir, '.warden', 'skills', basename(meta.skillPath));
    const skill = await resolveSkillAsync(skillPath);
    log(`Skill resolved: ${skill.name}`);

    // Explicit options win over the values from the eval spec.
    const model = options.model ?? meta.model;
    const runtime = options.runtime ?? meta.runtime;
    log(`Running skill with model: ${model} [${runtime}]`);

    const report = await runSkill(skill, context, {
      apiKey: options.apiKey,
      model,
      runtime,
      verbose: options.verbose,
      parallel: false,
      // Finding-processing events are only recorded in verbose mode.
      callbacks: options.verbose
        ? {
            onFindingProcessing: (event) => {
              log(formatFindingProcessingEvent(event));
            },
          }
        : undefined,
    });

    log(`Skill complete: ${report.findings.length} finding(s)`);
    for (const finding of report.findings) {
      const loc = finding.location ? ` (${finding.location.path}:${finding.location.startLine})` : '';
      log(`  [${finding.severity}] ${finding.title}${loc}`);
    }

    return {
      name,
      meta,
      report,
      logs,
      durationMs: Date.now() - startTime,
    };
  } finally {
    // Always remove the temp repo, on success and on failure alike.
    if (repoDir) {
      rmSync(repoDir, { recursive: true, force: true });
    }
  }
}