// Complete multi-agent system for fact-checking, article writing, and argumentation analysis.
// Two backends: llama.cpp (★ preferred) and Ollama.
// Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
// File listing metadata: 237 lines · 7.7 KiB · TypeScript
/**
 * lib/ollama.ts
 * Central Ollama client: text chat and vision/OCR calls.
 *
 * Newly created agents use this client instead of inline fetch.
 * Existing agents (ollama-claim-extractor, verifier) can be migrated step by step.
 *
 * Configuration:
 *   OLLAMA_HOST → Ollama URL (default: http://localhost:11434)
 */

/**
 * Turns a raw `OLLAMA_HOST` value into a base URL that can be safely used as
 * `${base}/api/...`.
 *
 * - `undefined`, empty, or whitespace-only → `http://localhost:11434`
 * - value without a scheme (e.g. `127.0.0.1:11434`) → prefixed with `http://`
 * - trailing slashes are removed so the request path never starts with `//`
 *
 * @param raw Value read from the environment, if any.
 * @returns Normalized base URL without a trailing slash.
 */
function normalizeOllamaHost(raw: string | undefined): string {
  const trimmed = raw?.trim() ?? "";
  if (trimmed === "") return "http://localhost:11434";

  // A scheme is "<letters/digits/+.->://"; anything else is treated as host[:port].
  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(trimmed);
  const withScheme = hasScheme ? trimmed : `http://${trimmed}`;

  return withScheme.replace(/\/+$/, "");
}

/** Base URL of the Ollama server, taken from the `OLLAMA_HOST` environment variable. */
export const OLLAMA_HOST = normalizeOllamaHost(process.env.OLLAMA_HOST);

/** A single chat message as sent in the `messages` array of an Ollama chat request. */
export type OllamaMessage = {
  /** Who is speaking in this turn. */
  role: "system" | "user" | "assistant";
  /** Text content of the message. */
  content: string;
  /** Base64-encoded images (vision calls). */
  images?: string[];
};

/** Outcome of one Ollama call as returned by the client functions in this file. */
export type OllamaResult = {
  /** Text produced by the model. */
  text: string;
  /** Number of prompt tokens reported for the request. */
  promptTokens: number;
  /** Number of generated tokens reported for the request. */
  completionTokens: number;
  /** Elapsed wall-clock time of the call in milliseconds. */
  latencyMs: number;
};

// ---------------------------------------------------------------------------
// Internal
// ---------------------------------------------------------------------------

/** Maximum number of attempts for one Ollama request (first try included). */
const MAX_RETRIES = 3;
/** Pause between two attempts, in milliseconds (15 seconds). */
const RETRY_DELAY_MS = 15_000;

/**
 * Resolves after the given delay; used to pause between retry attempts.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

// ---------------------------------------------------------------------------
// Main call
// ---------------------------------------------------------------------------

/**
 * Generic Ollama chat call (text or vision) via `POST {OLLAMA_HOST}/api/chat`
 * with `stream: false`.
 *
 * For vision, set the `images` field on the messages or use callOllamaVision().
 *
 * Failed attempts (network errors, non-2xx responses, unparsable bodies) are
 * retried up to MAX_RETRIES times with a pause of RETRY_DELAY_MS in between.
 * A request cancelled through `options.signal` is NOT retried: the abort error
 * is rethrown right away instead of being wrapped after the back-off delays.
 * An abort that happens during a back-off pause surfaces on the next attempt.
 *
 * @param model    Name of the Ollama model to run.
 * @param messages Conversation to send, in order.
 * @param options  Optional request tuning, see the individual fields.
 * @returns The trimmed answer text, token counts (0 when the server omits
 *          them), and the total elapsed time including retries.
 * @throws The original abort error when `options.signal` is aborted.
 * @throws Error after MAX_RETRIES failed attempts, carrying the last failure message.
 */
export async function callOllamaChat(
  model: string,
  messages: OllamaMessage[],
  options?: {
    /** JSON schema for structured output (Ollama >= 0.5) */
    format?: "json" | Record<string, unknown>;
    temperature?: number;
    numCtx?: number;
    numPredict?: number;
    /**
     * Thinking mode for qwen3/deepseek-r1 models (default: false).
     *   false → answer only, no chain-of-thought
     *   true  → model thinks first; answer in `content`, thinking in a separate field
     */
    think?: boolean;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const t0 = Date.now();
  const signal = options?.signal;

  // qwen3 and deepseek-r1 have thinking mode enabled by default.
  // For structured output (JSON, extraction) thinking is unwanted.
  const think = options?.think ?? false;

  // The request body is identical for every attempt, so build and serialize it once.
  const body: Record<string, unknown> = {
    model,
    messages,
    stream: false,
    think,
    options: {
      temperature: options?.temperature ?? 0.1,
      // Falsy values (undefined, 0) are not forwarded.
      ...(options?.numCtx ? { num_ctx: options.numCtx } : {}),
      ...(options?.numPredict ? { num_predict: options.numPredict } : {}),
    },
  };
  if (options?.format !== undefined) {
    body.format = options.format;
  }
  const payload = JSON.stringify(body);

  let lastError: unknown;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const resp = await fetch(`${OLLAMA_HOST}/api/chat`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: payload,
        signal,
      });

      if (!resp.ok) {
        // Best effort: include the server's error text, but never fail on reading it.
        const errText = await resp.text().catch(() => "");
        throw new Error(`Ollama HTTP ${resp.status}: ${errText}`);
      }

      const data = (await resp.json()) as {
        message?: { content?: string; thinking?: string };
        prompt_eval_count?: number;
        eval_count?: number;
      };

      // With thinking models (qwen3, deepseek-r1): if content is empty,
      // fall back to the thinking field (happens with very short answers).
      const text =
        data.message?.content?.trim() ||
        (think ? data.message?.thinking?.trim() : "") ||
        "";

      return {
        text,
        promptTokens: data.prompt_eval_count ?? 0,
        completionTokens: data.eval_count ?? 0,
        latencyMs: Date.now() - t0,
      };
    } catch (err) {
      // A caller-initiated cancellation is not a transient failure:
      // do not sleep, do not retry, do not hide it behind the generic error.
      if (signal?.aborted) throw err;

      lastError = err;
      if (attempt < MAX_RETRIES) await sleep(RETRY_DELAY_MS);
    }
  }

  throw new Error(
    `Ollama fehlgeschlagen nach ${MAX_RETRIES} Versuchen: ${
      lastError instanceof Error ? lastError.message : String(lastError)
    }`
  );
}

// ---------------------------------------------------------------------------
// Vision / OCR
// ---------------------------------------------------------------------------

/**
 * True when `value` is base64-encoded JPEG data rather than a file path.
 *
 * The JPEG magic bytes FF D8 FF encode to the base64 prefix "/9j/", so JPEG
 * payloads start with "/" exactly like absolute POSIX paths do.
 */
function isBase64Jpeg(value: string): boolean {
  return value.startsWith("/9j/") && /^[A-Za-z0-9+/]+={0,2}$/.test(value);
}

/** Resolves an image source (file path, data URL, or raw base64) to a raw base64 string. */
async function loadImageAsBase64(imageSource: string): Promise<string> {
  // Data URLs: keep only the payload after "data:<mime>;base64,".
  const dataUrlPrefix = /^data:[^,]*;base64,/i.exec(imageSource);
  if (dataUrlPrefix) return imageSource.slice(dataUrlPrefix[0].length);

  const isHomeRelative = imageSource.startsWith("~");
  const isAbsolutePath = imageSource.startsWith("/") && !isBase64Jpeg(imageSource);
  if (!isHomeRelative && !isAbsolutePath) return imageSource; // already base64

  const { readFile } = await import("node:fs/promises");
  let resolvedPath = imageSource;
  if (isHomeRelative) {
    const { homedir } = await import("node:os");
    resolvedPath = homedir() + imageSource.slice(1);
  }
  const buf = await readFile(resolvedPath);
  return buf.toString("base64");
}

/**
 * Ollama call with an image input (vision / OCR).
 *
 * Recommended models (all fit on an RTX 3090 24GB):
 *   fredrezones55/chandra-ocr-2:patch  5.8GB — OCR-specialised, documents/scans
 *   qwen3-vl:latest                    6.1GB — vision-language, image description + OCR
 *   qwen2.5vl:7b                       6.0GB — alternative to qwen3-vl
 *   minicpm-v:latest                   5.5GB — lightweight, good for simple OCR
 *
 * @param model       Name of the Ollama vision model to run.
 * @param imageSource Absolute file path ("/…"), home-relative path ("~…"),
 *                    `data:…;base64,` URL, or raw base64 string. Base64 JPEG
 *                    data (which starts with "/9j/") is recognised as base64,
 *                    not as a path.
 * @param prompt      User prompt sent together with the image.
 * @param options     Optional system prompt, temperature (default 0.1) and abort signal.
 * @returns The result of the underlying callOllamaChat() call.
 * @throws When the image file cannot be read, or when callOllamaChat() fails.
 */
export async function callOllamaVision(
  model: string,
  imageSource: string,
  prompt: string,
  options?: {
    systemPrompt?: string;
    temperature?: number;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const imageBase64 = await loadImageAsBase64(imageSource);

  const messages: OllamaMessage[] = [];
  if (options?.systemPrompt) {
    messages.push({ role: "system", content: options.systemPrompt });
  }
  messages.push({ role: "user", content: prompt, images: [imageBase64] });

  return callOllamaChat(model, messages, {
    temperature: options?.temperature ?? 0.1,
    signal: options?.signal,
  });
}

// ---------------------------------------------------------------------------
// Model info (locally installed, fits an RTX 3090 24GB)
// ---------------------------------------------------------------------------

/**
 * All known local Ollama models, grouped by category.
 *
 * Each entry maps the Ollama model tag to its approximate VRAM footprint in GB
 * (`vramGB`) and the number of GPUs it is listed with (`gpus`).
 */
export const LOCAL_CATALOG = {
  // --- Text / Reasoning ---
  text: {
    /** 17GB — main all-rounder, 1 GPU */
    "qwen3.5:27b": { vramGB: 17, gpus: 1 },
    /** 19GB — built-in reasoning (DeepSeek R1), 1 GPU */
    "deepseek-r1:32b": { vramGB: 19, gpus: 1 },
    /** 18GB — code + general purpose, 128k context, 1 GPU */
    "qwen3-coder-30b-128k:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GPU-optimised variant of the Qwen3 coder, 1 GPU */
    "qwen3-coder-30b-gpu:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GLM-4.7 Flash, Chinese model, 1 GPU */
    "glm-4.7-flash:latest": { vramGB: 18, gpus: 1 },
    /** 17GB — Gemma4 26B by Google, 1 GPU */
    "gemma4:26b": { vramGB: 17, gpus: 1 },
    /** 9.6GB — Gemma4 E4B (efficiency variant), 1 GPU */
    "gemma4:e4b": { vramGB: 9.6, gpus: 1 },
    /** 9.0GB — Qwen2.5 14B Instruct, 1 GPU */
    "qwen2.5:14b-instruct": { vramGB: 9, gpus: 1 },
    /** 5.2GB — Qwen3 8B, fast for simple tasks */
    "qwen3:8b": { vramGB: 5.2, gpus: 1 },
    /** 4.9GB — Llama 3.1 8B */
    "llama3.1:8b": { vramGB: 4.9, gpus: 1 },
    /** 7.1GB — Mistral Nemo */
    "mistral-nemo:latest": { vramGB: 7.1, gpus: 1 },
  },
  // --- Code ---
  code: {
    /** 9.0GB — Qwen2.5-Coder 14B, 1 GPU */
    "qwen2.5-coder:14b": { vramGB: 9, gpus: 1 },
    /** 4.7GB — Qwen2.5-Coder 7B, fast */
    "qwen2.5-coder:7b": { vramGB: 4.7, gpus: 1 },
  },
  // --- Vision / OCR ---
  vision: {
    /** 5.8GB — OCR-specialised (Chandra OCR 2) */
    "fredrezones55/chandra-ocr-2:patch": { vramGB: 5.8, gpus: 1 },
    /** 6.1GB — Qwen3 vision-language model */
    "qwen3-vl:latest": { vramGB: 6.1, gpus: 1 },
    /** 6.0GB — Qwen2.5 vision-language 7B */
    "qwen2.5vl:7b": { vramGB: 6, gpus: 1 },
    /** 5.5GB — MiniCPM-V, lightweight */
    "minicpm-v:latest": { vramGB: 5.5, gpus: 1 },
    /** 3.3GB — Qwen3-VL 4B, very small */
    "qwen3-vl:4b": { vramGB: 3.3, gpus: 1 },
  },
  // --- Embedding ---
  embedding: {
    /** 4.7GB — Qwen3 Embedding */
    "qwen3-embedding:latest": { vramGB: 4.7, gpus: 1 },
  },
} as const;