feat: Pi Text-Agent — initialer Commit (sauberes Repo)

Vollständiges Multi-Agenten-System für Fact-Checking, Artikelschreiben
und Argumentationsanalyse. Zwei Backends: llama.cpp (★ bevorzugt) und Ollama.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-05-12 04:21:48 +02:00
commit 5146b7fa30
62 changed files with 11279 additions and 0 deletions

237
lib/ollama.ts Normal file
View file

@ -0,0 +1,237 @@
/**
* lib/ollama.ts
* Zentraler Ollama-Client: Text-Chat und Vision/OCR-Aufrufe.
*
* Neu angelegte Agenten nutzen diesen Client statt inline-fetch.
* Bestehende Agenten (ollama-claim-extractor, verifier) können schrittweise migriert werden.
*
* Konfiguration:
* OLLAMA_HOST Ollama-URL (Standard: http://localhost:11434)
*/
/**
 * Normalises the raw `OLLAMA_HOST` environment value into a base URL that can
 * be prefixed to API paths such as `/api/chat`.
 *
 * - unset, empty or whitespace-only values fall back to `http://localhost:11434`
 * - a value without a scheme (e.g. `127.0.0.1:11434`) is prefixed with `http://`
 * - trailing slashes are removed so `${OLLAMA_HOST}/api/chat` never contains `//`
 *
 * @param raw Value of the environment variable, or `undefined` when unset.
 * @returns Base URL without a trailing slash.
 */
function resolveOllamaHost(raw: string | undefined): string {
  const trimmed = raw?.trim() ?? "";
  if (trimmed === "") return "http://localhost:11434";

  // "host:port" alone would be parsed by fetch() as a URL with scheme "host".
  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const withScheme = hasScheme ? trimmed : `http://${trimmed}`;

  return withScheme.replace(/\/+$/, "");
}

/** Base URL of the Ollama server, taken from `OLLAMA_HOST` (default: http://localhost:11434). */
export const OLLAMA_HOST = resolveOllamaHost(process.env.OLLAMA_HOST);
/** One entry of the chat history sent to Ollama. */
export type OllamaMessage = {
  role:
    | "system"
    | "user"
    | "assistant";
  content: string;
  /** Base64-encoded images attached to this message (vision calls). */
  images?: Array<string>;
};

/** Answer text plus token counts and latency of one chat call. */
export type OllamaResult = {
  text: string;
  promptTokens: number;
  completionTokens: number;
  latencyMs: number;
};
// ---------------------------------------------------------------------------
// Intern
// ---------------------------------------------------------------------------
/** Maximum number of attempts per request. */
const MAX_RETRIES = 3;

/** Pause between two attempts, in milliseconds. */
const RETRY_DELAY_MS = 15 * 1000;

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
// ---------------------------------------------------------------------------
// Haupt-Aufruf
// ---------------------------------------------------------------------------
/**
 * Generic Ollama chat call (text or vision) against `${OLLAMA_HOST}/api/chat`.
 * For vision, set the `images` field on the messages or use callOllamaVision().
 *
 * The request is sent non-streaming. A failed attempt (network error, non-2xx
 * status, unparsable response) is retried up to MAX_RETRIES times with a pause
 * of RETRY_DELAY_MS between attempts.
 *
 * A caller-initiated abort is NOT retried: if `options.signal` is aborted when
 * an attempt fails, the failure is rethrown immediately and unwrapped, so a
 * cancelled request does not sit through the retry pauses. An abort that
 * happens during a retry pause surfaces when that pause ends, because the next
 * fetch() rejects at once.
 *
 * @param model    Name of the Ollama model to query.
 * @param messages Chat history to send.
 * @param options  Optional request settings, see the individual fields.
 * @returns The trimmed answer text, token counts (0 when the server omits
 *          them) and the latency measured from function entry, which includes
 *          any retry pauses.
 * @throws The underlying error when `options.signal` is aborted; otherwise an
 *         `Error` wrapping the last failure once all attempts are used up.
 */
export async function callOllamaChat(
  model: string,
  messages: OllamaMessage[],
  options?: {
    /** `"json"` or a JSON schema for structured output (Ollama >= 0.5). */
    format?: "json" | Record<string, unknown>;
    /** Sampling temperature (default: 0.1). */
    temperature?: number;
    /** Sent as `num_ctx` when set to a non-zero value. */
    numCtx?: number;
    /** Sent as `num_predict` when set to a non-zero value. */
    numPredict?: number;
    /**
     * Thinking mode for qwen3/deepseek-r1 models (default: false).
     *   false  answer only, no chain of thought
     *   true   model thinks first; answer in `content`, thinking in a separate field
     */
    think?: boolean;
    /** Cancels the in-flight request and stops further retries. */
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const t0 = Date.now();
  let lastError: unknown;

  // qwen3 and deepseek-r1 have thinking enabled by default. For structured
  // output (JSON, extraction) thinking is unwanted, so it defaults to off.
  const think = options?.think ?? false;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const body: Record<string, unknown> = {
        model,
        messages,
        stream: false,
        think,
        options: {
          temperature: options?.temperature ?? 0.1,
          ...(options?.numCtx ? { num_ctx: options.numCtx } : {}),
          ...(options?.numPredict ? { num_predict: options.numPredict } : {}),
        },
      };
      if (options?.format !== undefined) {
        body.format = options.format;
      }

      const resp = await fetch(`${OLLAMA_HOST}/api/chat`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(body),
        signal: options?.signal,
      });
      if (!resp.ok) {
        // Reading the error body is best-effort; the status alone is still useful.
        const errText = await resp.text().catch(() => "");
        throw new Error(`Ollama HTTP ${resp.status}: ${errText}`);
      }

      const data = await resp.json() as {
        message?: { content?: string; thinking?: string };
        prompt_eval_count?: number;
        eval_count?: number;
      };

      // With thinking models (qwen3, deepseek-r1): if content is empty, fall
      // back to the thinking field (happens with very short answers).
      const text = data.message?.content?.trim()
        || (think ? data.message?.thinking?.trim() : "")
        || "";

      return {
        text,
        promptTokens: data.prompt_eval_count ?? 0,
        completionTokens: data.eval_count ?? 0,
        latencyMs: Date.now() - t0,
      };
    } catch (err) {
      // Cancellation is a decision by the caller, not a transient fault:
      // retrying would only delay the rejection and hide its real cause.
      if (options?.signal?.aborted) throw err;

      lastError = err;
      if (attempt < MAX_RETRIES) await sleep(RETRY_DELAY_MS);
    }
  }

  throw new Error(
    `Ollama fehlgeschlagen nach ${MAX_RETRIES} Versuchen: ${
      lastError instanceof Error ? lastError.message : String(lastError)
    }`
  );
}
// ---------------------------------------------------------------------------
// Vision / OCR
// ---------------------------------------------------------------------------
/** Characters of the standard base64 alphabet, with up to two padding characters. */
const BASE64_PATTERN = /^[A-Za-z0-9+/]+={0,2}$/;

/** True when `value` is plausibly a padded base64 payload (alphabet and length check only). */
function looksLikeBase64(value: string): boolean {
  return value.length % 4 === 0 && BASE64_PATTERN.test(value);
}

/**
 * Resolves an image source to the raw base64 string sent to Ollama.
 *
 * "/" is part of the base64 alphabet and every base64-encoded JPEG starts with
 * "/9j/", so a leading slash alone does not prove the value is a file path.
 * The value is therefore tried as a path first; if reading fails and the value
 * is plausible base64, it is passed through unchanged.
 */
async function resolveImageBase64(imageSource: string): Promise<string> {
  // The images field takes raw base64, so strip a data-URI header if present.
  const dataUriHeader = /^data:[^,]*;base64,/i.exec(imageSource);
  if (dataUriHeader) {
    return imageSource.slice(dataUriHeader[0].length);
  }

  const homeRelative = imageSource.startsWith("~");
  if (!homeRelative && !imageSource.startsWith("/")) {
    return imageSource; // already base64
  }

  // Plain concatenation keeps "$" sequences in HOME literal (String.replace would interpret them).
  const resolvedPath = homeRelative
    ? (process.env.HOME ?? "/root") + imageSource.slice(1)
    : imageSource;

  const { readFile } = await import("node:fs/promises");
  try {
    const buf = await readFile(resolvedPath);
    return buf.toString("base64");
  } catch (err) {
    // "~" is not a base64 character, so only "/"-prefixed payloads can match here.
    if (looksLikeBase64(imageSource)) return imageSource;
    throw err;
  }
}

/**
 * Ollama call with image input (vision / OCR).
 *
 * Recommended models (all fit on an RTX 3090 24GB):
 *   fredrezones55/chandra-ocr-2:patch   5.8GB  OCR-specialised, documents/scans
 *   qwen3-vl:latest                     6.1GB  vision-language, image description + OCR
 *   qwen2.5vl:7b                        6.0GB  alternative to qwen3-vl
 *   minicpm-v:latest                    5.5GB  lightweight, good for simple OCR
 *
 * @param model       Name of the vision-capable Ollama model.
 * @param imageSource Absolute file path ("/…"), home-relative path ("~/…"),
 *                    raw base64 string, or base64 data URI.
 * @param prompt      User prompt sent together with the image.
 * @param options     Optional system prompt, temperature (default 0.1) and abort signal.
 * @returns The result of callOllamaChat() for the assembled messages.
 * @throws The file-system error when `imageSource` is a path that cannot be
 *         read and is not plausible base64; otherwise whatever callOllamaChat() throws.
 */
export async function callOllamaVision(
  model: string,
  imageSource: string,
  prompt: string,
  options?: {
    systemPrompt?: string;
    temperature?: number;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const imageBase64 = await resolveImageBase64(imageSource);

  const messages: OllamaMessage[] = [];
  if (options?.systemPrompt) {
    messages.push({ role: "system", content: options.systemPrompt });
  }
  messages.push({ role: "user", content: prompt, images: [imageBase64] });

  return callOllamaChat(model, messages, {
    temperature: options?.temperature ?? 0.1,
    signal: options?.signal,
  });
}
// ---------------------------------------------------------------------------
// Modell-Infos (lokal installiert, passend für RTX 3090 24GB)
// ---------------------------------------------------------------------------
/**
 * All known local Ollama models, grouped by category (text, code, vision, embedding).
 *
 * Each entry maps a model tag to `vramGB` (the size noted in the entry's
 * comment) and `gpus` (1 for every entry listed here). The object is declared
 * `as const`, so keys and values are exposed as readonly literal types.
 */
export const LOCAL_CATALOG = {
// --- Text / Reasoning ---
text: {
/** 17GB — main all-rounder, 1 GPU */
"qwen3.5:27b": { vramGB: 17, gpus: 1 },
/** 19GB — built-in reasoning (DeepSeek R1), 1 GPU */
"deepseek-r1:32b": { vramGB: 19, gpus: 1 },
/** 18GB — code + general purpose, 128k context, 1 GPU */
"qwen3-coder-30b-128k:latest": { vramGB: 18, gpus: 1 },
/** 18GB — optimised GPU variant of the Qwen3 coder, 1 GPU */
"qwen3-coder-30b-gpu:latest": { vramGB: 18, gpus: 1 },
/** 18GB — GLM-4.7 Flash, Chinese model, 1 GPU */
"glm-4.7-flash:latest": { vramGB: 18, gpus: 1 },
/** 17GB — Gemma4 26B by Google, 1 GPU */
"gemma4:26b": { vramGB: 17, gpus: 1 },
/** 9.6GB — Gemma4 E4B (efficiency variant), 1 GPU */
"gemma4:e4b": { vramGB: 9.6, gpus: 1 },
/** 9.0GB — Qwen2.5 14B Instruct, 1 GPU */
"qwen2.5:14b-instruct": { vramGB: 9, gpus: 1 },
/** 5.2GB — Qwen3 8B, fast for simple tasks */
"qwen3:8b": { vramGB: 5.2, gpus: 1 },
/** 4.9GB — Llama 3.1 8B */
"llama3.1:8b": { vramGB: 4.9, gpus: 1 },
/** 7.1GB — Mistral Nemo */
"mistral-nemo:latest": { vramGB: 7.1, gpus: 1 },
},
// --- Code ---
code: {
/** 9.0GB — Qwen2.5-Coder 14B, 1 GPU */
"qwen2.5-coder:14b": { vramGB: 9, gpus: 1 },
/** 4.7GB — Qwen2.5-Coder 7B, fast */
"qwen2.5-coder:7b": { vramGB: 4.7, gpus: 1 },
},
// --- Vision / OCR ---
vision: {
/** 5.8GB — OCR-specialised (Chandra OCR 2) */
"fredrezones55/chandra-ocr-2:patch": { vramGB: 5.8, gpus: 1 },
/** 6.1GB — Qwen3 vision-language model */
"qwen3-vl:latest": { vramGB: 6.1, gpus: 1 },
/** 6.0GB — Qwen2.5 vision-language 7B */
"qwen2.5vl:7b": { vramGB: 6, gpus: 1 },
/** 5.5GB — MiniCPM-V, lightweight */
"minicpm-v:latest": { vramGB: 5.5, gpus: 1 },
/** 3.3GB — Qwen3-VL 4B, very small */
"qwen3-vl:4b": { vramGB: 3.3, gpus: 1 },
},
// --- Embedding ---
embedding: {
/** 4.7GB — Qwen3 embedding */
"qwen3-embedding:latest": { vramGB: 4.7, gpus: 1 },
},
} as const;