All files / src/vision index.ts

43.75% Statements 35/80
57.14% Branches 4/7
33.33% Functions 3/9
43.75% Lines 35/80

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140                                                                                  1x 1x 1x 1x 1x 1x 1x 1x                           1x 3x 3x 3x 3x 3x 3x   3x                                                                       3x 3x       3x       3x       3x       3x 3x   3x 3x   1x 2x 2x 2x 2x 2x 2x 2x 2x 2x  
/**
 * AgentKits — Multimodal (Vision)
 *
 * Unified vision interface across providers.
 * Send images + text to GPT-4o, Gemini, Qwen-VL, etc.
 *
 * Usage:
 *   import { createVision } from 'agentkits/vision';
 *   const vision = createVision({ provider: 'gemini', apiKey: '...' });
 *   const result = await vision.describe(imageBuffer, 'What is in this image?');
 */
 
import type { ChatConfig, ChatMessage } from '../llm/index.js';
 
// ── Types ──────────────────────────────────────────────────────────
 
/** Supported backends; all are reached through an OpenAI-compatible chat-completions endpoint ('custom' = any such endpoint). */
export type VisionProvider = 'openai' | 'gemini' | 'dashscope' | 'zhipu' | 'ollama' | 'custom';

/** Options for createVision. Every field is optional; unset fields fall back to per-provider defaults. */
export interface VisionConfig {
  /** Backend to target; defaults to 'openai'. */
  provider?: VisionProvider;
  /** Model name; defaults to the provider's default model (e.g. 'gpt-4o' for openai). */
  model?: string;
  /** API key; falls back to OPENAI_API_KEY, then GEMINI_API_KEY env vars, then ''. */
  apiKey?: string;
  /** OpenAI-compatible base URL (no trailing '/chat/completions'); defaults per provider. */
  baseUrl?: string;
  /** max_tokens sent with each request; defaults to 1024. */
  maxTokens?: number;
}

export interface VisionClient {
  /** Describe an image with optional prompt */
  describe(image: Buffer | string, prompt?: string): Promise<string>;
  /** Ask a question about an image */
  ask(image: Buffer | string, question: string): Promise<string>;
  /** Compare multiple images */
  compare(images: Array<Buffer | string>, prompt?: string): Promise<string>;
  /** Extract text from image (OCR) */
  ocr(image: Buffer | string): Promise<string>;
  /** Resolved config snapshot; note the apiKey is masked as '***'. */
  readonly config: Readonly<VisionConfig>;
}
 
// ── Provider Defaults ─────────────────────────────────────────────
 
// Per-provider fallbacks used by createVision when the caller omits `model`
// or `baseUrl`. Every endpoint speaks the OpenAI chat-completions dialect.
// `satisfies` validates completeness against VisionProvider while keeping
// literal key/value inference.
const DEFAULTS = {
  openai: { model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' },
  gemini: { model: 'gemini-2.0-flash', baseUrl: 'https://generativelanguage.googleapis.com/v1beta/openai' },
  dashscope: { model: 'qwen-vl-max', baseUrl: 'https://dashscope.aliyuncs.com/compatible-mode/v1' },
  zhipu: { model: 'glm-4v', baseUrl: 'https://open.bigmodel.cn/api/paas/v4' },
  ollama: { model: 'llava', baseUrl: 'http://localhost:11434/v1' },
  custom: { model: 'default', baseUrl: '' },
} satisfies Record<VisionProvider, { model: string; baseUrl: string }>;
 
/**
 * Normalize an image input into something the OpenAI-compatible APIs accept:
 * http(s) URLs and data URLs pass through unchanged; bare base64 strings and
 * Buffers are wrapped into data URLs.
 *
 * Fixes vs. previous version:
 * - `startsWith('http')` misclassified base64 payloads that happen to begin
 *   with "http" as URLs; now only `http://` / `https://` pass through.
 * - Buffer MIME type is sniffed from magic bytes (JPEG/PNG/GIF/WebP) instead
 *   of always claiming `image/png`; png remains the fallback, so previous
 *   behavior is preserved for png and unrecognized data.
 */
function imageToDataUrl(image: Buffer | string): string {
  if (typeof image === 'string') {
    // Already addressable by the API as-is.
    if (image.startsWith('http://') || image.startsWith('https://') || image.startsWith('data:')) {
      return image;
    }
    // Bare base64 payload — format unknown, png is the historical default.
    return `data:image/png;base64,${image}`;
  }
  return `data:${sniffImageMime(image)};base64,${image.toString('base64')}`;
}

/** Best-effort MIME detection from well-known image file signatures; 'image/png' when unknown. */
function sniffImageMime(buf: Buffer): string {
  // JPEG: FF D8 FF
  if (buf.length >= 3 && buf[0] === 0xff && buf[1] === 0xd8 && buf[2] === 0xff) return 'image/jpeg';
  // PNG: 89 'P' 'N' 'G'
  if (buf.length >= 4 && buf[0] === 0x89 && buf[1] === 0x50 && buf[2] === 0x4e && buf[3] === 0x47) return 'image/png';
  // GIF: 'GIF'
  if (buf.length >= 3 && buf.toString('ascii', 0, 3) === 'GIF') return 'image/gif';
  // WebP: 'RIFF' .... 'WEBP'
  if (buf.length >= 12 && buf.toString('ascii', 0, 4) === 'RIFF' && buf.toString('ascii', 8, 12) === 'WEBP') return 'image/webp';
  return 'image/png';
}
 
// ── Main ──────────────────────────────────────────────────────────
 
/**
 * Create a vision client bound to one provider/model/endpoint.
 *
 * Resolution order for each setting: explicit config → provider default
 * (DEFAULTS) → hard-coded fallback (maxTokens = 1024, apiKey = '').
 * All requests go to `${baseUrl}/chat/completions` in the OpenAI dialect.
 *
 * @param config Optional overrides; see VisionConfig.
 * @returns A VisionClient whose methods resolve to the model's text reply
 *          ('' when the response carries no content).
 * @throws Error from every method when the HTTP response is not ok, or when
 *         called with an empty image list.
 */
export function createVision(config: VisionConfig = {}): VisionClient {
  const provider = config.provider ?? 'openai';
  const defaults = DEFAULTS[provider];
  const model = config.model ?? defaults.model;
  const baseUrl = config.baseUrl ?? defaults.baseUrl;
  // NOTE(review): this env fallback is not provider-aware — OPENAI_API_KEY
  // wins even for gemini/dashscope/zhipu. Kept as-is for backward
  // compatibility; confirm intended precedence with callers.
  const apiKey = config.apiKey ?? process.env.OPENAI_API_KEY ?? process.env.GEMINI_API_KEY ?? '';
  const maxTokens = config.maxTokens ?? 1024;

  /** Minimal shape of an OpenAI-style chat-completions response — only what we read. */
  interface ChatCompletionLike {
    choices?: Array<{ message?: { content?: string } }>;
  }

  /** Send one user message containing all images followed by the text prompt. */
  async function callVision(images: Array<Buffer | string>, prompt: string): Promise<string> {
    if (images.length === 0) {
      // Previously this silently sent a text-only request; fail loudly instead.
      throw new Error('Vision request requires at least one image');
    }

    const imageContent = images.map(img => ({
      type: 'image_url' as const,
      image_url: { url: imageToDataUrl(img) },
    }));

    const messages = [{
      role: 'user' as const,
      content: [
        ...imageContent,
        { type: 'text' as const, text: prompt },
      ],
    }];

    // Only attach Authorization when a key exists — local endpoints
    // (ollama/custom) need none, and `Bearer ` with an empty key is noise.
    const headers: Record<string, string> = { 'Content-Type': 'application/json' };
    if (apiKey) {
      headers['Authorization'] = `Bearer ${apiKey}`;
    }

    const response = await fetch(`${baseUrl}/chat/completions`, {
      method: 'POST',
      headers,
      body: JSON.stringify({
        model,
        messages,
        max_tokens: maxTokens,
      }),
    });

    if (!response.ok) {
      const err = await response.text();
      throw new Error(`Vision API error (${response.status}): ${err}`);
    }

    const data = (await response.json()) as ChatCompletionLike;
    return data.choices?.[0]?.message?.content ?? '';
  }

  const client: VisionClient = {
    async describe(image, prompt = 'Describe this image in detail.') {
      return callVision([image], prompt);
    },

    async ask(image, question) {
      return callVision([image], question);
    },

    async compare(images, prompt = 'Compare these images and describe the differences.') {
      return callVision(images, prompt);
    },

    async ocr(image) {
      return callVision([image], 'Extract ALL text from this image. Return only the text content, preserving the original formatting and layout as much as possible.');
    },

    // apiKey is deliberately masked so the snapshot is safe to log.
    get config() { return { provider, model, baseUrl, apiKey: '***', maxTokens }; },
  };

  return client;
}
 
/**
 * Enumerate the providers this module knows about, with a human-readable
 * name and the vision-capable models commonly used with each. Returns a
 * fresh array on every call.
 */
export function listVisionProviders(): Array<{ id: VisionProvider; name: string; models: string[] }> {
  const catalog: Array<[VisionProvider, string, string[]]> = [
    ['openai', 'OpenAI', ['gpt-4o', 'gpt-4o-mini']],
    ['gemini', 'Google Gemini', ['gemini-2.0-flash', 'gemini-2.5-pro']],
    ['dashscope', 'Alibaba DashScope', ['qwen-vl-max', 'qwen-vl-plus']],
    ['zhipu', 'Zhipu AI', ['glm-4v']],
    ['ollama', 'Ollama (local)', ['llava', 'bakllava']],
    ['custom', 'Custom endpoint', []],
  ];
  return catalog.map(([id, name, models]) => ({ id, name, models }));
}