/**
* AgentKits — Multimodal (Vision)
*
* Unified vision interface across providers.
* Send images + text to GPT-4o, Gemini, Qwen-VL, etc.
*
* Usage:
* import { createVision } from 'agentkits/vision';
* const vision = createVision({ provider: 'gemini', apiKey: '...' });
* const result = await vision.describe(imageBuffer, 'What is in this image?');
*/
// ── Types ──────────────────────────────────────────────────────────
export type VisionProvider = 'openai' | 'gemini' | 'dashscope' | 'zhipu' | 'ollama' | 'custom';
export interface VisionConfig {
provider?: VisionProvider;
model?: string;
apiKey?: string;
baseUrl?: string;
maxTokens?: number;
}
export interface VisionClient {
/** Describe an image with optional prompt */
describe(image: Buffer | string, prompt?: string): Promise<string>;
/** Ask a question about an image */
ask(image: Buffer | string, question: string): Promise<string>;
/** Compare multiple images */
compare(images: Array<Buffer | string>, prompt?: string): Promise<string>;
/** Extract text from image (OCR) */
ocr(image: Buffer | string): Promise<string>;
/** Current config */
readonly config: Readonly<VisionConfig>;
}
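// Example (sketch): typical calls against a VisionClient. The file paths,
// provider choice, and image URLs below are illustrative assumptions, not part
// of this module; any Buffer, URL, data URL, or base64 string is accepted.
//
//   import { readFileSync } from 'node:fs';
//   const vision = createVision({ provider: 'openai', apiKey: process.env.OPENAI_API_KEY });
//   const caption = await vision.describe(readFileSync('photo.png'));
//   const answer  = await vision.ask(readFileSync('photo.png'), 'How many people are visible?');
//   const text    = await vision.ocr(readFileSync('receipt.png'));
//   const diff    = await vision.compare(['https://example.com/a.png', 'https://example.com/b.png']);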
// ── Provider Defaults ─────────────────────────────────────────────
const DEFAULTS: Record<VisionProvider, { model: string; baseUrl: string }> = {
openai: { model: 'gpt-4o', baseUrl: 'https://api.openai.com/v1' },
gemini: { model: 'gemini-2.0-flash', baseUrl: 'https://generativelanguage.googleapis.com/v1beta/openai' },
dashscope: { model: 'qwen-vl-max', baseUrl: 'https://dashscope.aliyuncs.com/compatible-mode/v1' },
zhipu: { model: 'glm-4v', baseUrl: 'https://open.bigmodel.cn/api/paas/v4' },
ollama: { model: 'llava', baseUrl: 'http://localhost:11434/v1' },
custom: { model: 'default', baseUrl: '' },
};
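// Every built-in provider above is addressed through an OpenAI-compatible
// /chat/completions route, which is why one request shape (see callVision
// below) covers all of them. The 'custom' entry ships no baseUrl, so callers
// must supply baseUrl (and usually model) in VisionConfig themselves.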
function imageToDataUrl(image: Buffer | string): string {
if (typeof image === 'string') {
// Already a URL or data URL
if (image.startsWith('http') || image.startsWith('data:')) return image;
// Base64 string
return `data:image/png;base64,${image}`;
}
return `data:image/png;base64,${image.toString('base64')}`;
}
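// Example (sketch) of how the three accepted inputs are normalized
// (pngBytes is a placeholder for raw image data):
//   imageToDataUrl('https://example.com/cat.png')  // URL / data URL → returned unchanged
//   imageToDataUrl('iVBORw0KGgo...')               // bare base64 → 'data:image/png;base64,iVBORw0KGgo...'
//   imageToDataUrl(Buffer.from(pngBytes))          // Buffer → base64-encoded PNG data URL
// Non-PNG Buffers are still labeled image/png; whether that is accepted
// depends on the provider sniffing the actual bytes.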
// ── Main ──────────────────────────────────────────────────────────
export function createVision(config: VisionConfig = {}): VisionClient {
const provider = config.provider ?? 'openai';
const defaults = DEFAULTS[provider];
const model = config.model ?? defaults.model;
const baseUrl = config.baseUrl ?? defaults.baseUrl;
// API key falls back to OPENAI_API_KEY, then GEMINI_API_KEY, regardless of the selected provider.
const apiKey = config.apiKey ?? process.env.OPENAI_API_KEY ?? process.env.GEMINI_API_KEY ?? '';
const maxTokens = config.maxTokens ?? 1024;
/** Send one or more images plus a text prompt as a single OpenAI-compatible chat completion request. */
async function callVision(images: Array<Buffer | string>, prompt: string): Promise<string> {
const imageContent = images.map(img => ({
type: 'image_url' as const,
image_url: { url: imageToDataUrl(img) },
}));
const messages = [{
role: 'user' as const,
content: [
...imageContent,
{ type: 'text' as const, text: prompt },
],
}];
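// The JSON body built below follows the OpenAI-compatible multimodal shape, e.g.:
// {
//   "model": "gpt-4o",
//   "max_tokens": 1024,
//   "messages": [{
//     "role": "user",
//     "content": [
//       { "type": "image_url", "image_url": { "url": "data:image/png;base64,..." } },
//       { "type": "text", "text": "Describe this image in detail." }
//     ]
//   }]
// }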
const response = await fetch(`${baseUrl}/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
'Authorization': `Bearer ${apiKey}`,
},
body: JSON.stringify({
model,
messages,
max_tokens: maxTokens,
}),
});
if (!response.ok) {
const err = await response.text();
throw new Error(`Vision API error (${response.status}): ${err}`);
}
const data = await response.json() as { choices?: Array<{ message?: { content?: string } }> };
return data.choices?.[0]?.message?.content ?? '';
}
const client: VisionClient = {
async describe(image, prompt = 'Describe this image in detail.') {
return callVision([image], prompt);
},
async ask(image, question) {
return callVision([image], question);
},
async compare(images, prompt = 'Compare these images and describe the differences.') {
return callVision(images, prompt);
},
async ocr(image) {
return callVision([image], 'Extract ALL text from this image. Return only the text content, preserving the original formatting and layout as much as possible.');
},
// Expose the effective config, with the API key masked.
get config() { return { provider, model, baseUrl, apiKey: '***', maxTokens }; },
};
return client;
}
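// Example (sketch): a local Ollama setup. The defaults above resolve to the
// llava model at http://localhost:11434/v1; no API key is required, though an
// empty Bearer header is still sent. screenshotBuffer is a placeholder for any
// image Buffer.
//
//   const local = createVision({ provider: 'ollama' });
//   const answer = await local.ask(screenshotBuffer, 'Which menu item is highlighted?');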
export function listVisionProviders(): Array<{ id: VisionProvider; name: string; models: string[] }> {
return [
{ id: 'openai', name: 'OpenAI', models: ['gpt-4o', 'gpt-4o-mini'] },
{ id: 'gemini', name: 'Google Gemini', models: ['gemini-2.0-flash', 'gemini-2.5-pro'] },
{ id: 'dashscope', name: 'Alibaba DashScope', models: ['qwen-vl-max', 'qwen-vl-plus'] },
{ id: 'zhipu', name: 'Zhipu AI', models: ['glm-4v'] },
{ id: 'ollama', name: 'Ollama (local)', models: ['llava', 'bakllava'] },
{ id: 'custom', name: 'Custom endpoint', models: [] },
];
}
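// Example (sketch): rendering a provider/model picker from the registry.
//   for (const p of listVisionProviders()) {
//     console.log(`${p.name} (${p.id}): ${p.models.join(', ') || 'bring your own model'}`);
//   }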