/**
 * lib/ollama.ts
 * Central Ollama client: text chat and vision/OCR calls.
 *
 * Newly created agents use this client instead of inline fetch calls.
 * Existing agents (ollama-claim-extractor, verifier) can be migrated step by step.
 *
 * Configuration:
 *   OLLAMA_HOST → Ollama URL (default: http://localhost:11434)
 */

export const OLLAMA_HOST = process.env.OLLAMA_HOST ?? "http://localhost:11434";

/** One chat message as sent to the `/api/chat` endpoint. */
export type OllamaMessage = {
  role: "system" | "user" | "assistant";
  content: string;
  /** Base64-encoded images (vision calls). */
  images?: string[];
};

/** Normalised result of a single chat call. */
export type OllamaResult = {
  /** Trimmed answer text (empty string when the response carried none). */
  text: string;
  /** `prompt_eval_count` from the response, 0 when absent. */
  promptTokens: number;
  /** `eval_count` from the response, 0 when absent. */
  completionTokens: number;
  /** Wall-clock time from call start to parsed response, including retries. */
  latencyMs: number;
};

// ---------------------------------------------------------------------------
// Internal
// ---------------------------------------------------------------------------

const MAX_RETRIES = 3;
const RETRY_DELAY_MS = 15_000;

/** Shortest "/"-prefixed string that is considered a base64 image rather than a path. */
const MIN_BASE64_IMAGE_CHARS = 128;
/** Standard (padded) base64 alphabet. */
const BASE64_PAYLOAD = /^[A-Za-z0-9+/]+={0,2}$/;
/** File-system error codes that mean "this string does not name an existing file". */
const MISSING_PATH_CODES = new Set(["ENOENT", "ENAMETOOLONG", "ENOTDIR"]);

/**
 * Waits `ms` milliseconds. Resolves early (without rejecting) when `signal`
 * is aborted, so a cancelled caller is not held up by the retry back-off.
 */
function sleep(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve) => {
    if (signal?.aborted) {
      resolve();
      return;
    }
    const finish = (): void => {
      clearTimeout(timer);
      signal?.removeEventListener("abort", finish);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    signal?.addEventListener("abort", finish, { once: true });
  });
}

/** Builds the error thrown when the caller's AbortSignal cancelled the call. */
function abortError(cause: unknown): Error {
  const detail = cause instanceof Error ? `: ${cause.message}` : "";
  const error = new Error(`Ollama-Aufruf abgebrochen${detail}`);
  error.name = "AbortError";
  return error;
}

/** True when `err` carries a Node file-system code for a non-existent path. */
function isMissingPathError(err: unknown): boolean {
  if (typeof err !== "object" || err === null) return false;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" && MISSING_PATH_CODES.has(code);
}

/** True when `value` is long enough and well-formed enough to be a padded base64 payload. */
function looksLikeBase64Image(value: string): boolean {
  return (
    value.length >= MIN_BASE64_IMAGE_CHARS &&
    value.length % 4 === 0 &&
    BASE64_PAYLOAD.test(value)
  );
}

/**
 * Turns an image source (file path or base64 string) into a base64 string.
 *
 * Strings starting with "/" or "~" are read from disk. Base64-encoded JPEG
 * data also starts with "/" ("/9j/…"), so when a "/"-prefixed string does not
 * name an existing file but is well-formed base64, it is passed through as is.
 */
async function resolveImageBase64(imageSource: string): Promise<string> {
  const isTildePath = imageSource.startsWith("~");
  if (!isTildePath && !imageSource.startsWith("/")) {
    return imageSource; // already base64
  }

  const { readFile } = await import("node:fs/promises");
  // Plain concatenation: String.replace would interpret "$" sequences in HOME.
  const resolvedPath = isTildePath
    ? (process.env.HOME ?? "/root") + imageSource.slice(1)
    : imageSource;

  try {
    const buf = await readFile(resolvedPath);
    return buf.toString("base64");
  } catch (err) {
    if (!isTildePath && isMissingPathError(err) && looksLikeBase64Image(imageSource)) {
      return imageSource;
    }
    throw err;
  }
}

// ---------------------------------------------------------------------------
// Main call
// ---------------------------------------------------------------------------

/**
 * Generic Ollama chat (text or vision).
 * For vision: set the `images` field on the messages, or use callOllamaVision().
 *
 * Failed attempts are retried up to MAX_RETRIES times with RETRY_DELAY_MS
 * between attempts. A call cancelled through `options.signal` is not retried.
 *
 * @param model    Ollama model name.
 * @param messages Conversation to send.
 * @param options  Optional request settings, see the field comments.
 * @returns The trimmed answer text, token counts and total latency.
 * @throws Error named "AbortError" when `options.signal` is aborted.
 * @throws Error when all attempts failed; the message contains the last error.
 */
export async function callOllamaChat(
  model: string,
  messages: OllamaMessage[],
  options?: {
    /** JSON schema for structured output (Ollama >= 0.5). */
    format?: "json" | Record<string, unknown>;
    temperature?: number;
    numCtx?: number;
    numPredict?: number;
    /**
     * Thinking mode for qwen3/deepseek-r1 models (default: false).
     *   false → request is sent with think: false (answer only, no chain-of-thought)
     *   true  → model thinks first; answer in `content`, thinking in a separate field
     */
    think?: boolean;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const t0 = Date.now();
  const signal = options?.signal;
  let lastError: unknown;

  // qwen3 and deepseek-r1 have thinking mode enabled by default.
  // For structured output (JSON, extraction) thinking is unwanted.
  const think = options?.think ?? false;

  // A trailing slash in OLLAMA_HOST would otherwise yield "//api/chat".
  const endpoint = `${OLLAMA_HOST.replace(/\/+$/, "")}/api/chat`;

  const body: Record<string, unknown> = {
    model,
    messages,
    stream: false,
    think,
    options: {
      temperature: options?.temperature ?? 0.1,
      ...(options?.numCtx ? { num_ctx: options.numCtx } : {}),
      ...(options?.numPredict ? { num_predict: options.numPredict } : {}),
    },
  };
  if (options?.format !== undefined) {
    body.format = options.format;
  }

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    // Covers both "aborted before the first attempt" and "aborted during back-off".
    if (signal?.aborted) throw abortError(lastError);

    try {
      const resp = await fetch(endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(body),
        signal,
      });

      if (!resp.ok) {
        const errText = await resp.text().catch(() => "");
        throw new Error(`Ollama HTTP ${resp.status}: ${errText}`);
      }

      const data = (await resp.json()) as {
        message?: { content?: string; thinking?: string };
        prompt_eval_count?: number;
        eval_count?: number;
      };

      // With thinking enabled: if `content` is empty, fall back to the
      // `thinking` field.
      const text =
        data.message?.content?.trim() ||
        (think ? data.message?.thinking?.trim() : "") ||
        "";

      return {
        text,
        promptTokens: data.prompt_eval_count ?? 0,
        completionTokens: data.eval_count ?? 0,
        latencyMs: Date.now() - t0,
      };
    } catch (err) {
      lastError = err;
      // Cancellation is a caller decision, not a transient failure: do not retry.
      if (signal?.aborted) throw abortError(err);
      if (attempt < MAX_RETRIES) await sleep(RETRY_DELAY_MS, signal);
    }
  }

  throw new Error(
    `Ollama fehlgeschlagen nach ${MAX_RETRIES} Versuchen: ${
      lastError instanceof Error ? lastError.message : String(lastError)
    }`
  );
}

// ---------------------------------------------------------------------------
// Vision / OCR
// ---------------------------------------------------------------------------

/**
 * Ollama call with image input (vision / OCR).
 *
 * Recommended models (all fit on an RTX 3090 24GB):
 *   fredrezones55/chandra-ocr-2:patch  5.8GB — OCR-specialised, documents/scans
 *   qwen3-vl:latest                    6.1GB — vision-language, image description + OCR
 *   qwen2.5vl:7b                       6.0GB — alternative to qwen3-vl
 *   minicpm-v:latest                   5.5GB — lightweight, good for simple OCR
 *
 * @param model       Ollama model name.
 * @param imageSource Absolute file path ("/…"), "~"-prefixed path, or base64 string.
 * @param prompt      User prompt sent together with the image.
 * @param options     Optional system prompt, temperature and abort signal.
 * @returns The result of the underlying callOllamaChat() call.
 * @throws The file-system error when a path cannot be read, plus everything
 *         callOllamaChat() throws.
 */
export async function callOllamaVision(
  model: string,
  imageSource: string,
  prompt: string,
  options?: {
    systemPrompt?: string;
    temperature?: number;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const imageBase64 = await resolveImageBase64(imageSource);

  const messages: OllamaMessage[] = [];
  if (options?.systemPrompt) {
    messages.push({ role: "system", content: options.systemPrompt });
  }
  messages.push({ role: "user", content: prompt, images: [imageBase64] });

  return callOllamaChat(model, messages, {
    temperature: options?.temperature ?? 0.1,
    signal: options?.signal,
  });
}

// ---------------------------------------------------------------------------
// Model info (installed locally, suitable for an RTX 3090 24GB)
// ---------------------------------------------------------------------------

/** All known local Ollama models by category. */
export const LOCAL_CATALOG = {
  // --- Text / Reasoning ---
  text: {
    /** 17GB — main all-rounder, 1 GPU */
    "qwen3.5:27b": { vramGB: 17, gpus: 1 },
    /** 19GB — built-in reasoning (DeepSeek R1), 1 GPU */
    "deepseek-r1:32b": { vramGB: 19, gpus: 1 },
    /** 18GB — code + general purpose, 128k context, 1 GPU */
    "qwen3-coder-30b-128k:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GPU-optimised variant of Qwen3-Coder, 1 GPU */
    "qwen3-coder-30b-gpu:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GLM-4.7 Flash, Chinese model, 1 GPU */
    "glm-4.7-flash:latest": { vramGB: 18, gpus: 1 },
    /** 17GB — Gemma4 26B by Google, 1 GPU */
    "gemma4:26b": { vramGB: 17, gpus: 1 },
    /** 9.6GB — Gemma4 E4B (efficiency variant), 1 GPU */
    "gemma4:e4b": { vramGB: 9.6, gpus: 1 },
    /** 9.0GB — Qwen2.5 14B Instruct, 1 GPU */
    "qwen2.5:14b-instruct": { vramGB: 9, gpus: 1 },
    /** 5.2GB — Qwen3 8B, fast for simple tasks */
    "qwen3:8b": { vramGB: 5.2, gpus: 1 },
    /** 4.9GB — Llama 3.1 8B */
    "llama3.1:8b": { vramGB: 4.9, gpus: 1 },
    /** 7.1GB — Mistral Nemo */
    "mistral-nemo:latest": { vramGB: 7.1, gpus: 1 },
  },

  // --- Code ---
  code: {
    /** 9.0GB — Qwen2.5-Coder 14B, 1 GPU */
    "qwen2.5-coder:14b": { vramGB: 9, gpus: 1 },
    /** 4.7GB — Qwen2.5-Coder 7B, fast */
    "qwen2.5-coder:7b": { vramGB: 4.7, gpus: 1 },
  },

  // --- Vision / OCR ---
  vision: {
    /** 5.8GB — OCR-specialised (Chandra OCR 2) */
    "fredrezones55/chandra-ocr-2:patch": { vramGB: 5.8, gpus: 1 },
    /** 6.1GB — Qwen3 vision-language model */
    "qwen3-vl:latest": { vramGB: 6.1, gpus: 1 },
    /** 6.0GB — Qwen2.5 vision-language 7B */
    "qwen2.5vl:7b": { vramGB: 6, gpus: 1 },
    /** 5.5GB — MiniCPM-V, lightweight */
    "minicpm-v:latest": { vramGB: 5.5, gpus: 1 },
    /** 3.3GB — Qwen3-VL 4B, very small */
    "qwen3-vl:4b": { vramGB: 3.3, gpus: 1 },
  },

  // --- Embedding ---
  embedding: {
    /** 4.7GB — Qwen3 Embedding */
    "qwen3-embedding:latest": { vramGB: 4.7, gpus: 1 },
  },
} as const;