Text_Agent/lib/ollama.ts

/**
 * lib/ollama.ts
 * Zentraler Ollama-Client: Text-Chat und Vision/OCR-Aufrufe.
 *
 * Neu angelegte Agenten nutzen diesen Client statt inline-fetch.
 * Bestehende Agenten (ollama-claim-extractor, verifier) können schrittweise migriert werden.
 *
 * Konfiguration:
 *   OLLAMA_HOST   → Ollama-URL (Standard: http://localhost:11434)
 */

/**
 * Base URL of the Ollama server, without trailing slash.
 *
 * Read from the OLLAMA_HOST environment variable. An unset or blank value
 * falls back to http://localhost:11434. Trailing slashes are stripped so that
 * `${OLLAMA_HOST}/api/...` never contains a double slash.
 */
export const OLLAMA_HOST =
	(process.env.OLLAMA_HOST ?? "").trim().replace(/\/+$/, "") || "http://localhost:11434";

/** One chat message as placed in the `messages` array of a chat request. */
export type OllamaMessage = {
	/** Author of the message. */
	role: "system" | "user" | "assistant";
	/** Message text. */
	content: string;
	images?: string[]; // base64-encoded images (vision calls)
};

/** Outcome of a completed chat call, as returned by callOllamaChat(). */
export type OllamaResult = {
	/** Trimmed answer text; empty string when the response carried none. */
	text: string;
	/** `prompt_eval_count` from the response, 0 when absent. */
	promptTokens: number;
	/** `eval_count` from the response, 0 when absent. */
	completionTokens: number;
	/** Milliseconds from the start of the call until it returned, retries included. */
	latencyMs: number;
};

// ---------------------------------------------------------------------------
// Intern
// ---------------------------------------------------------------------------

/** Maximum number of attempts per call, including the first one. */
const MAX_RETRIES = 3;
/** Pause between two attempts, in milliseconds. */
const RETRY_DELAY_MS = 15_000;

/**
 * Resolves after `ms` milliseconds.
 *
 * When `signal` is supplied and is already aborted, or aborts before the
 * timer fires, the timer is cleared and the promise rejects with the signal's
 * abort reason. Without a signal this is a plain timeout.
 */
function sleep(ms: number, signal?: AbortSignal): Promise<void> {
	return new Promise<void>((resolve, reject) => {
		if (signal?.aborted) {
			reject(signal.reason);
			return;
		}
		const onAbort = (): void => {
			clearTimeout(timer);
			reject(signal?.reason);
		};
		const timer = setTimeout(() => {
			// Drop the listener so long-lived signals do not accumulate handlers.
			signal?.removeEventListener("abort", onAbort);
			resolve();
		}, ms);
		signal?.addEventListener("abort", onAbort, { once: true });
	});
}

// ---------------------------------------------------------------------------
// Main call
// ---------------------------------------------------------------------------

/**
 * Generic Ollama chat call (text or vision) via `POST {OLLAMA_HOST}/api/chat`.
 * For vision: set the `images` field on the messages, or use callOllamaVision().
 *
 * A failed attempt (network error, non-2xx status, unparsable response) is
 * retried until MAX_RETRIES attempts were made, pausing RETRY_DELAY_MS
 * between attempts. Cancellation through `options.signal` is never retried:
 * the call rejects right away, also while it is waiting for the next attempt.
 *
 * @param model     Name of the Ollama model to query.
 * @param messages  Chat messages, sent unchanged.
 * @param options   Optional request settings (see the individual fields).
 * @returns The trimmed answer text, token counts (0 when not reported) and the
 *          total latency including retries.
 * @throws The original rejection / abort reason when `options.signal` is
 *         aborted; otherwise an Error wrapping the last failure once all
 *         attempts have failed.
 */
export async function callOllamaChat(
	model: string,
	messages: OllamaMessage[],
	options?: {
		/** JSON schema for structured output (Ollama >= 0.5) */
		format?: "json" | Record<string, unknown>;
		temperature?: number;
		numCtx?: number;
		numPredict?: number;
		/**
		 * Thinking mode for qwen3/deepseek-r1 models (default: false).
		 * false → /no_think → answer only, no chain-of-thought
		 * true  → model thinks first, answer in content; thinking in a separate field
		 */
		think?: boolean;
		signal?: AbortSignal;
	}
): Promise<OllamaResult> {
	const t0 = Date.now();
	const signal = options?.signal;
	let lastError: unknown;

	// qwen3 and deepseek-r1 have thinking mode enabled by default.
	// For structured output (JSON, extraction) thinking is unwanted.
	const think = options?.think ?? false;

	for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
		try {
			const body: Record<string, unknown> = {
				model,
				messages,
				stream: false,
				think,
				options: {
					temperature: options?.temperature ?? 0.1,
					// Falsy values (undefined, 0) are left out of the request.
					...(options?.numCtx ? { num_ctx: options.numCtx } : {}),
					...(options?.numPredict ? { num_predict: options.numPredict } : {}),
				},
			};
			if (options?.format !== undefined) {
				body.format = options.format;
			}

			const resp = await fetch(`${OLLAMA_HOST}/api/chat`, {
				method: "POST",
				headers: { "Content-Type": "application/json" },
				body: JSON.stringify(body),
				signal,
			});

			if (!resp.ok) {
				const errText = await resp.text().catch(() => "");
				throw new Error(`Ollama HTTP ${resp.status}: ${errText}`);
			}

			const data = await resp.json() as {
				message?: { content?: string; thinking?: string };
				prompt_eval_count?: number;
				eval_count?: number;
			};

			// With thinking models (qwen3, deepseek-r1): if content is empty,
			// fall back to the thinking field (happens with very short answers).
			const text = data.message?.content?.trim()
				|| (think ? data.message?.thinking?.trim() : "")
				|| "";

			return {
				text,
				promptTokens: data.prompt_eval_count ?? 0,
				completionTokens: data.eval_count ?? 0,
				latencyMs: Date.now() - t0,
			};
		} catch (err) {
			// A cancelled call is not a transient failure: surface it unchanged
			// instead of sleeping and re-sending a request nobody waits for.
			if (signal?.aborted) throw err;
			lastError = err;
			// Rejects with the abort reason if the caller cancels during the wait.
			if (attempt < MAX_RETRIES) await sleep(RETRY_DELAY_MS, signal);
		}
	}

	throw new Error(
		`Ollama fehlgeschlagen nach ${MAX_RETRIES} Versuchen: ${
			lastError instanceof Error ? lastError.message : String(lastError)
		}`
	);
}

// ---------------------------------------------------------------------------
// Vision / OCR
// ---------------------------------------------------------------------------

/**
 * Decides whether a string that starts with "/" is inline base64 image data
 * rather than a file path.
 *
 * "/" is a regular base64 character, and every base64-encoded JPEG starts
 * with "/9j/", so the leading slash alone cannot identify a path. The input
 * counts as inline data when it consists solely of base64 characters and
 * either carries the JPEG prefix or is longer than 4096 characters
 * (PATH_MAX on Linux), i.e. too long to be a usable path.
 */
function isSlashLeadingBase64(source: string): boolean {
	const couldBePath = source.length <= 4096 && !source.startsWith("/9j/");
	return !couldBePath && /^[A-Za-z0-9+\/]+={0,2}$/.test(source);
}

/** Turns a path, data URL or raw base64 string into the bare base64 payload. */
async function loadImageBase64(imageSource: string): Promise<string> {
	// Data URLs: only the payload after the "base64," marker is image data.
	const dataUrlPrefix = /^data:[^,]*;base64,/i.exec(imageSource);
	if (dataUrlPrefix) return imageSource.slice(dataUrlPrefix[0].length);

	const homeRelative = imageSource.startsWith("~");
	if (!homeRelative && !imageSource.startsWith("/")) return imageSource; // already base64
	if (!homeRelative && isSlashLeadingBase64(imageSource)) return imageSource;

	const { readFile } = await import("node:fs/promises");
	let resolvedPath = imageSource;
	if (homeRelative) {
		const { homedir } = await import("node:os");
		// Replacer function: the home directory is inserted literally, "$" included.
		resolvedPath = imageSource.replace(/^~/, () => homedir());
	}
	const buf = await readFile(resolvedPath);
	return buf.toString("base64");
}

/**
 * Ollama call with image input (vision / OCR).
 *
 * Recommended models (all fit on an RTX 3090 24GB):
 *   fredrezones55/chandra-ocr-2:patch   5.8GB  — OCR-specialised, documents/scans
 *   qwen3-vl:latest                     6.1GB  — vision-language, image description + OCR
 *   qwen2.5vl:7b                        6.0GB  — alternative to qwen3-vl
 *   minicpm-v:latest                    5.5GB  — lightweight, good for simple OCR
 *
 * @param model        Name of the Ollama model to query.
 * @param imageSource  Absolute file path ("/…"), home-relative path ("~…"),
 *                     raw base64 string, or base64 data URL ("data:…;base64,…").
 * @param prompt       User prompt sent together with the image.
 * @param options      Optional system prompt, temperature (default 0.1) and abort signal.
 * @returns The result of callOllamaChat() for the assembled messages.
 * @throws If the image file cannot be read, or whatever callOllamaChat() throws.
 */
export async function callOllamaVision(
	model: string,
	imageSource: string,
	prompt: string,
	options?: {
		systemPrompt?: string;
		temperature?: number;
		signal?: AbortSignal;
	}
): Promise<OllamaResult> {
	const imageBase64 = await loadImageBase64(imageSource);

	const messages: OllamaMessage[] = [];
	if (options?.systemPrompt) {
		messages.push({ role: "system", content: options.systemPrompt });
	}
	messages.push({ role: "user", content: prompt, images: [imageBase64] });

	return callOllamaChat(model, messages, {
		temperature: options?.temperature ?? 0.1,
		signal: options?.signal,
	});
}

// ---------------------------------------------------------------------------
// Modell-Infos (lokal installiert, passend für RTX 3090 24GB)
// ---------------------------------------------------------------------------

/** All known local Ollama models, grouped by category. */
export const LOCAL_CATALOG = {
	// --- Text / Reasoning ---
	text: {
		/** 17GB — main all-rounder, 1 GPU */
		"qwen3.5:27b":             { vramGB: 17, gpus: 1 },
		/** 19GB — built-in reasoning (DeepSeek R1), 1 GPU */
		"deepseek-r1:32b":         { vramGB: 19, gpus: 1 },
		/** 18GB — code + general purpose, 128k context, 1 GPU */
		"qwen3-coder-30b-128k:latest": { vramGB: 18, gpus: 1 },
		/** 18GB — GPU-optimised variant of the Qwen3 coder, 1 GPU */
		"qwen3-coder-30b-gpu:latest":  { vramGB: 18, gpus: 1 },
		/** 18GB — GLM-4.7 Flash, Chinese model, 1 GPU */
		"glm-4.7-flash:latest":    { vramGB: 18, gpus: 1 },
		/** 17GB — Gemma4 26B by Google, 1 GPU */
		"gemma4:26b":              { vramGB: 17, gpus: 1 },
		/** 9.6GB — Gemma4 E4B (efficiency variant), 1 GPU */
		"gemma4:e4b":              { vramGB: 9.6, gpus: 1 },
		/** 9.0GB — Qwen2.5 14B Instruct, 1 GPU */
		"qwen2.5:14b-instruct":    { vramGB: 9, gpus: 1 },
		/** 5.2GB — Qwen3 8B, fast for simple tasks */
		"qwen3:8b":                { vramGB: 5.2, gpus: 1 },
		/** 4.9GB — Llama 3.1 8B */
		"llama3.1:8b":             { vramGB: 4.9, gpus: 1 },
		/** 7.1GB — Mistral Nemo */
		"mistral-nemo:latest":     { vramGB: 7.1, gpus: 1 },
	},
	// --- Code ---
	code: {
		/** 9.0GB — Qwen2.5-Coder 14B, 1 GPU */
		"qwen2.5-coder:14b":       { vramGB: 9, gpus: 1 },
		/** 4.7GB — Qwen2.5-Coder 7B, fast */
		"qwen2.5-coder:7b":        { vramGB: 4.7, gpus: 1 },
	},
	// --- Vision / OCR ---
	vision: {
		/** 5.8GB — OCR-specialised (Chandra OCR 2) */
		"fredrezones55/chandra-ocr-2:patch": { vramGB: 5.8, gpus: 1 },
		/** 6.1GB — Qwen3 vision-language model */
		"qwen3-vl:latest":         { vramGB: 6.1, gpus: 1 },
		/** 6.0GB — Qwen2.5 vision-language 7B */
		"qwen2.5vl:7b":            { vramGB: 6, gpus: 1 },
		/** 5.5GB — MiniCPM-V, lightweight */
		"minicpm-v:latest":        { vramGB: 5.5, gpus: 1 },
		/** 3.3GB — Qwen3-VL 4B, very small */
		"qwen3-vl:4b":             { vramGB: 3.3, gpus: 1 },
	},
	// --- Embedding ---
	embedding: {
		/** 4.7GB — Qwen3 embedding */
		"qwen3-embedding:latest":  { vramGB: 4.7, gpus: 1 },
	},
} as const;