All files / src/modules guardrails.ts

100% Statements 146/146
82.5% Branches 33/40
100% Functions 9/9
100% Lines 146/146

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264                                                                                                                                                                            1x 1x 1x 1x 1x 1x 1x 1x       1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x       1x     1x       1x 15x 15x   15x 15x 15x   15x 15x   15x 15x 15x 15x 15x 15x   15x   15x 13x 13x   13x   78x 78x 78x 4x 4x 4x 4x 4x 4x 4x 4x 4x 4x 4x 4x 78x   13x 13x   15x 13x 13x 143x 143x 4x 4x 4x 4x 4x 4x 4x 143x 13x 13x   15x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x   15x 14x 14x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 1x 14x 14x   15x 14x 14x   14x 13x 13x 13x 13x   14x 13x 13x   14x 1x 1x   14x   14x 14x   14x 14x   15x 15x 15x 15x 15x 1x 1x   15x 15x 13x 13x 15x 1x 1x 15x 15x 4x 4x 15x 15x  
/**
 * AgentKits — Guardrails Module
 *
 * Input/output validation for LLM pipelines.
 * - PII detection (emails, phones, SSNs, credit cards)
 * - Prompt injection detection
 * - Toxicity check (keyword-based)
 * - Factuality check (placeholder for external verifiers)
 *
 * Usage:
 *   import { createGuardrails } from 'agentkits/guardrails';
 *   const guard = createGuardrails({ blockOnViolation: true });
 *   const result = guard.validateInput('my email is foo@bar.com');
 */
 
// ── Types ──────────────────────────────────────────────────────────
 
/** Severity of a violation: 'warn' is recorded but passes; 'block' fails validation. */
export type ViolationSeverity = 'warn' | 'block';

/** A single rule violation found in a piece of text. */
export interface Violation {
  /** Rule identifier, e.g. 'pii:email', 'injection', 'toxicity', 'custom:<name>'. */
  rule: string;
  /** Severity assigned by the rule's configuration. */
  severity: ViolationSeverity;
  /** Human-readable description of what was detected. */
  message: string;
  /** The offending substring, when one is available. */
  match?: string;
}

/** Outcome of validating one piece of text. */
export interface ValidationResult {
  /** True when the text is acceptable under the configured policy. */
  passed: boolean;
  /** All violations found; empty when the text is clean. */
  violations: Violation[];
  /** Text with detected PII redacted (equals the input when nothing was redacted). */
  sanitized?: string;
}

/** Description of one active rule, as reported by GuardrailsClient.rules. */
export interface GuardrailRule {
  name: string;
  enabled: boolean;
  severity: ViolationSeverity;
}
 
/**
 * A user-defined regex rule evaluated alongside the built-in checks.
 * Shape of each entry in {@link GuardrailsConfig.customRules}.
 */
interface CustomRuleConfig {
  /** Rule name; violations are reported as `custom:<name>`. */
  name: string;
  /** Pattern that triggers the rule when it matches the text. */
  pattern: RegExp;
  /** Severity of resulting violations (falls back to the client default). */
  severity?: ViolationSeverity;
  /** Message attached to resulting violations. */
  message: string;
  /** Apply to 'input' | 'output' | 'both' (default: 'both'). */
  scope?: 'input' | 'output' | 'both';
}

/** Options accepted by createGuardrails. Every field is optional. */
export interface GuardrailsConfig {
  /** Default severity for violations (default: 'warn') */
  defaultSeverity?: ViolationSeverity;
  /** Block the request if any violation found (default: false) */
  blockOnViolation?: boolean;
  /** PII detection rules */
  pii?: {
    enabled?: boolean;
    severity?: ViolationSeverity;
    /** Redact PII in sanitized output (default: true) */
    redact?: boolean;
  };
  /** Prompt injection detection */
  injection?: {
    enabled?: boolean;
    severity?: ViolationSeverity;
  };
  /** Toxicity detection (output) */
  toxicity?: {
    enabled?: boolean;
    severity?: ViolationSeverity;
    /** Custom blocked words/phrases */
    customBlocklist?: string[];
  };
  /** Custom rules */
  customRules?: CustomRuleConfig[];
}
 
/** Validation entry points returned by createGuardrails. */
export interface GuardrailsClient {
  /** Validate input text before sending to LLM (runs PII, injection, and custom rules). */
  validateInput(text: string): ValidationResult;
  /** Validate output text from LLM (runs PII, toxicity, and custom rules). */
  validateOutput(text: string): ValidationResult;
  /** Validate text for the given direction; validateInput/validateOutput delegate here. */
  validate(text: string, direction: 'input' | 'output'): ValidationResult;
  /** Get active rules */
  readonly rules: GuardrailRule[];
}
 
// ── PII Patterns ──────────────────────────────────────────────────
 
// Ordered most-specific → least-specific. checkPII redacts sequentially in
// list order, so ssn/credit_card must claim their digit runs before the
// looser phone patterns can partially consume them: an unseparated 16-digit
// card number contains a 10-digit substring that phone_us would otherwise
// redact first, leaving the card number only partially masked.
const PII_PATTERNS: Array<{ name: string; pattern: RegExp; replacement: string }> = [
  { name: 'email', pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g, replacement: '[EMAIL]' },
  { name: 'ssn', pattern: /\b[0-9]{3}-[0-9]{2}-[0-9]{4}\b/g, replacement: '[SSN]' },
  { name: 'credit_card', pattern: /\b(?:[0-9]{4}[-\s]?){3}[0-9]{4}\b/g, replacement: '[CREDIT_CARD]' },
  { name: 'phone_us', pattern: /(?:\+?1[-.\s]?)?\(?[0-9]{3}\)?[-.\s]?[0-9]{3}[-.\s]?[0-9]{4}/g, replacement: '[PHONE]' },
  { name: 'phone_intl', pattern: /\+[0-9]{1,3}[-.\s]?[0-9]{4,14}/g, replacement: '[PHONE]' },
  { name: 'ip_address', pattern: /\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b/g, replacement: '[IP]' },
];
 
// ── Injection Patterns ────────────────────────────────────────────
 
// Heuristic patterns for common prompt-injection / jailbreak phrasings.
// Keyword heuristics only — a determined attacker can evade them; treat this
// as one defense layer, not the whole defense.
const INJECTION_PATTERNS: RegExp[] = [
  /ignore\s+(all\s+)?previous\s+(instructions?|prompts?|rules?)/i,
  /disregard\s+(all\s+)?previous/i,
  /forget\s+(all\s+)?(your\s+)?(instructions?|rules?|prompts?)/i,
  /you\s+are\s+now\s+(a|an)\s+/i,
  // \b prevents false positives on words that merely end in "system",
  // e.g. "ecosystem:" or "subsystem:".
  /\bsystem\s*:\s*/i,
  /\]\]\s*>\s*/i,
  /\<\/?system\>/i,
  /act\s+as\s+(if|though)\s+you\s+(have\s+)?no\s+(restrictions?|rules?|limitations?)/i,
  /pretend\s+(you\s+are|to\s+be)\s+(a\s+)?(?:unrestricted|unfiltered|jailbroken)/i,
  /do\s+anything\s+now/i,
  /bypass\s+(your\s+)?(safety|content|ethical)\s+(filters?|guidelines?|restrictions?)/i,
];
 
// ── Toxicity Keywords ─────────────────────────────────────────────
 
const DEFAULT_TOXICITY_BLOCKLIST: string[] = [
  // Intentionally empty placeholder — real production systems use ML classifiers.
  // Supply domain-specific terms via GuardrailsConfig.toxicity.customBlocklist.
];
 
// ── Factory ───────────────────────────────────────────────────────
 
/**
 * Create a guardrails client from the given configuration.
 *
 * Every check is opt-out (enabled unless explicitly disabled). PII redaction
 * produces the `sanitized` field of results; injection is checked only on
 * input, toxicity only on output, custom rules per their `scope`.
 *
 * @param config - see {@link GuardrailsConfig}; an empty object yields defaults.
 * @returns a {@link GuardrailsClient} exposing the validators and active rules.
 */
export function createGuardrails(config: GuardrailsConfig = {}): GuardrailsClient {
  const defaultSeverity = config.defaultSeverity ?? 'warn';
  const blockOnViolation = config.blockOnViolation ?? false;

  // Checks are opt-out: only an explicit `enabled: false` disables them.
  const piiEnabled = config.pii?.enabled !== false;
  const piiSeverity = config.pii?.severity ?? defaultSeverity;
  const piiRedact = config.pii?.redact !== false;

  const injectionEnabled = config.injection?.enabled !== false;
  // Injection defaults to 'block' rather than defaultSeverity: a detected
  // injection attempt is rarely benign.
  const injectionSeverity = config.injection?.severity ?? 'block';

  const toxicityEnabled = config.toxicity?.enabled !== false;
  const toxicitySeverity = config.toxicity?.severity ?? defaultSeverity;
  // Precompute lowercase forms once (checkToxicity runs on every output
  // validation) and drop empty entries — `text.includes('')` is always true,
  // so an empty blocklist term would flag every text.
  const toxicityTerms = [
    ...DEFAULT_TOXICITY_BLOCKLIST,
    ...(config.toxicity?.customBlocklist ?? []),
  ]
    .filter(word => word.length > 0)
    .map(word => ({ original: word, lowered: word.toLowerCase() }));

  const customRules = config.customRules ?? [];

  /** Detect (and, if configured, redact) PII. Violations reference the original text. */
  function checkPII(text: string): { violations: Violation[]; sanitized: string } {
    const violations: Violation[] = [];
    let sanitized = text;

    for (const { name, pattern, replacement } of PII_PATTERNS) {
      // Defensive reset — the module-level /g regexes are shared across calls.
      pattern.lastIndex = 0;
      const matches = text.match(pattern);
      if (matches) {
        for (const m of matches) {
          violations.push({
            rule: `pii:${name}`,
            severity: piiSeverity,
            message: `PII detected: ${name}`,
            match: m,
          });
        }
        if (piiRedact) {
          // Redact cumulatively so earlier replacements survive later passes.
          sanitized = sanitized.replace(pattern, replacement);
        }
      }
    }

    return { violations, sanitized };
  }

  /** Heuristic prompt-injection scan; one violation per matching pattern. */
  function checkInjection(text: string): Violation[] {
    const violations: Violation[] = [];
    for (const pattern of INJECTION_PATTERNS) {
      const match = text.match(pattern);
      if (match) {
        violations.push({
          rule: 'injection',
          severity: injectionSeverity,
          message: 'Potential prompt injection detected',
          match: match[0],
        });
      }
    }
    return violations;
  }

  /** Case-insensitive substring scan against the configured blocklist. */
  function checkToxicity(text: string): Violation[] {
    const violations: Violation[] = [];
    const lower = text.toLowerCase();
    for (const term of toxicityTerms) {
      if (lower.includes(term.lowered)) {
        violations.push({
          rule: 'toxicity',
          severity: toxicitySeverity,
          message: `Blocked content detected`,
          match: term.original, // report the blocklist entry as configured
        });
      }
    }
    return violations;
  }

  /** Apply user-supplied regex rules whose scope covers `direction`. */
  function checkCustom(text: string, direction: 'input' | 'output'): Violation[] {
    const violations: Violation[] = [];
    for (const rule of customRules) {
      const scope = rule.scope ?? 'both';
      if (scope !== 'both' && scope !== direction) continue;
      const match = text.match(rule.pattern);
      if (match) {
        violations.push({
          rule: `custom:${rule.name}`,
          severity: rule.severity ?? defaultSeverity,
          message: rule.message,
          match: match[0],
        });
      }
    }
    return violations;
  }

  /** Run every applicable check and fold the results into one ValidationResult. */
  function validate(text: string, direction: 'input' | 'output'): ValidationResult {
    const violations: Violation[] = [];
    let sanitized = text;

    if (piiEnabled) {
      const piiResult = checkPII(text);
      violations.push(...piiResult.violations);
      sanitized = piiResult.sanitized;
    }

    if (direction === 'input' && injectionEnabled) {
      violations.push(...checkInjection(text));
    }

    if (direction === 'output' && toxicityEnabled) {
      violations.push(...checkToxicity(text));
    }

    violations.push(...checkCustom(text, direction));

    // A 'block'-severity violation always fails; with blockOnViolation set,
    // any violation at all (including warnings) fails.
    const hasBlock = violations.some(v => v.severity === 'block');
    const passed = blockOnViolation ? violations.length === 0 : !hasBlock;

    return { passed, violations, sanitized };
  }

  // Snapshot of the active rule set, exposed via the `rules` getter.
  const activeRules: GuardrailRule[] = [];
  if (piiEnabled) activeRules.push({ name: 'pii', enabled: true, severity: piiSeverity });
  if (injectionEnabled) activeRules.push({ name: 'injection', enabled: true, severity: injectionSeverity });
  if (toxicityEnabled) activeRules.push({ name: 'toxicity', enabled: true, severity: toxicitySeverity });
  for (const r of customRules) {
    activeRules.push({ name: `custom:${r.name}`, enabled: true, severity: r.severity ?? defaultSeverity });
  }

  return {
    validateInput(text: string): ValidationResult {
      return validate(text, 'input');
    },
    validateOutput(text: string): ValidationResult {
      return validate(text, 'output');
    },
    validate,
    get rules() {
      return activeRules;
    },
  };
}