feat: Pi Text-Agent — initialer Commit (sauberes Repo)
Vollständiges Multi-Agenten-System für Fact-Checking, Artikelschreiben und Argumentationsanalyse. Zwei Backends: llama.cpp (★ bevorzugt) und Ollama.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
5146b7fa30
62 changed files with 11279 additions and 0 deletions
237
lib/ollama.ts
Normal file
237
lib/ollama.ts
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
/**
 * lib/ollama.ts
 * Zentraler Ollama-Client: Text-Chat und Vision/OCR-Aufrufe.
 *
 * Neu angelegte Agenten nutzen diesen Client statt inline-fetch.
 * Bestehende Agenten (ollama-claim-extractor, verifier) können schrittweise migriert werden.
 *
 * Konfiguration:
 *   OLLAMA_HOST → Ollama-URL (Standard: http://localhost:11434)
 */
|
||||
|
||||
/**
 * Turns a raw `OLLAMA_HOST` setting into a base URL that can be prefixed to an
 * API path such as `/api/chat`.
 *
 * - `undefined`, empty or whitespace-only values fall back to `http://localhost:11434`.
 * - A value without a scheme (e.g. `127.0.0.1:11434`) gets `http://` prepended,
 *   because `fetch()` needs an absolute URL.
 * - Trailing slashes are removed so that `${OLLAMA_HOST}/api/chat` never
 *   contains a double slash.
 *
 * Values that already are a scheme-qualified URL without trailing slash are
 * returned unchanged.
 */
function normalizeOllamaHost(raw: string | undefined): string {
  const trimmed = raw?.trim();
  if (!trimmed) return "http://localhost:11434";

  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(trimmed);
  const withScheme = hasScheme ? trimmed : `http://${trimmed}`;
  return withScheme.replace(/\/+$/, "");
}

/** Base URL of the Ollama server, read once from the `OLLAMA_HOST` environment variable. */
export const OLLAMA_HOST = normalizeOllamaHost(process.env.OLLAMA_HOST);
|
||||
|
||||
/** One chat message as placed in the `messages` array of an Ollama chat request. */
export type OllamaMessage = {
  /** Who the message is attributed to. */
  role: "system" | "user" | "assistant";
  /** Text content of the message. */
  content: string;
  /** Base64-encoded images attached to the message (used for vision calls). */
  images?: string[];
};
|
||||
|
||||
/** Result of one Ollama chat call as returned by this client. */
export type OllamaResult = {
  /** Response text. */
  text: string;
  /** Number of prompt tokens reported for the request. */
  promptTokens: number;
  /** Number of generated tokens reported for the request. */
  completionTokens: number;
  /** Elapsed wall-clock time of the call in milliseconds. */
  latencyMs: number;
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Intern
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Total number of attempts per request (the first try counts as attempt 1). */
const MAX_RETRIES = 3;

/** Pause between two consecutive attempts, in milliseconds. */
const RETRY_DELAY_MS = 15_000;
|
||||
|
||||
/**
 * Returns a promise that resolves (with no value) after `ms` milliseconds.
 *
 * @param ms Delay in milliseconds, passed straight to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Haupt-Aufruf
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Generic Ollama chat call (text or vision): sends one non-streaming
 * `POST ${OLLAMA_HOST}/api/chat` request and returns the answer text plus
 * token counts.
 *
 * For vision, set the `images` field on the messages or use `callOllamaVision()`.
 *
 * Failed attempts (network error, non-2xx status, unparsable body) are retried
 * up to `MAX_RETRIES` attempts in total with `RETRY_DELAY_MS` between them.
 * A cancelled `options.signal` is never retried: the abort error is rethrown
 * immediately, and a pending retry delay ends as soon as the signal fires.
 *
 * @param model    Ollama model name, sent as `model`.
 * @param messages Conversation, sent as `messages`.
 * @param options  Optional request tuning, see the individual fields.
 * @returns The trimmed answer text, the reported token counts (0 when the
 *          response omits them) and the elapsed time since the call started,
 *          which includes any retry delays.
 * @throws The original error when `options.signal` is aborted; otherwise an
 *         `Error` wrapping the last failure after all attempts are used up.
 */
export async function callOllamaChat(
  model: string,
  messages: OllamaMessage[],
  options?: {
    /** `"json"` or a JSON schema for structured output (Ollama >= 0.5). */
    format?: "json" | Record<string, unknown>;
    /** Sampling temperature (default: 0.1). */
    temperature?: number;
    /** Sent as `num_ctx`; omitted when unset or 0. */
    numCtx?: number;
    /** Sent as `num_predict`; omitted when unset or 0. */
    numPredict?: number;
    /**
     * Thinking mode for qwen3/deepseek-r1 style models (default: false).
     *   false → answer only, no chain-of-thought
     *   true  → the model thinks first; the answer arrives in `content`,
     *           the reasoning in a separate `thinking` field
     */
    think?: boolean;
    /** Cancels the in-flight request and any pending retry. */
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const startedAt = Date.now();
  const signal = options?.signal;

  // qwen3 and deepseek-r1 think by default; for structured output
  // (JSON, extraction) that is unwanted, so thinking is opt-in here.
  const think = options?.think ?? false;

  // The request is identical for every attempt, so build and serialise it once.
  const requestBody: Record<string, unknown> = {
    model,
    messages,
    stream: false,
    think,
    options: {
      temperature: options?.temperature ?? 0.1,
      ...(options?.numCtx ? { num_ctx: options.numCtx } : {}),
      ...(options?.numPredict ? { num_predict: options.numPredict } : {}),
    },
  };
  if (options?.format !== undefined) {
    requestBody.format = options.format;
  }
  const payload = JSON.stringify(requestBody);

  let lastError: unknown;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      const resp = await fetch(`${OLLAMA_HOST}/api/chat`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: payload,
        signal,
      });

      if (!resp.ok) {
        const errText = await resp.text().catch(() => "");
        throw new Error(`Ollama HTTP ${resp.status}: ${errText}`);
      }

      const data = await resp.json() as {
        message?: { content?: string; thinking?: string };
        prompt_eval_count?: number;
        eval_count?: number;
      };

      // Thinking models may leave `content` empty (seen with very short
      // answers); in thinking mode fall back to the `thinking` field.
      // `||` is intentional: an empty string must trigger the fallback.
      const text = data.message?.content?.trim()
        || (think ? data.message?.thinking?.trim() : "")
        || "";

      return {
        text,
        promptTokens: data.prompt_eval_count ?? 0,
        completionTokens: data.eval_count ?? 0,
        latencyMs: Date.now() - startedAt,
      };
    } catch (err) {
      // The caller cancelled: another attempt cannot succeed and would only
      // postpone the rejection by the retry delay.
      if (signal?.aborted) throw err;

      lastError = err;
      if (attempt < MAX_RETRIES) await delayUnlessAborted(RETRY_DELAY_MS, signal);
    }
  }

  throw new Error(
    `Ollama fehlgeschlagen nach ${MAX_RETRIES} Versuchen: ${
      lastError instanceof Error ? lastError.message : String(lastError)
    }`
  );
}

/**
 * Waits `ms` milliseconds, but resolves early when `signal` is aborted.
 * Never rejects; the caller detects the abort on its next request.
 */
function delayUnlessAborted(ms: number, signal: AbortSignal | undefined): Promise<void> {
  if (signal === undefined) return sleep(ms);
  if (signal.aborted) return Promise.resolve();

  const activeSignal: AbortSignal = signal;
  return new Promise<void>((resolve) => {
    const finish = (): void => {
      clearTimeout(timer);
      activeSignal.removeEventListener("abort", finish);
      resolve();
    };
    const timer = setTimeout(finish, ms);
    activeSignal.addEventListener("abort", finish, { once: true });
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Vision / OCR
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Strict base64 alphabet with optional `=` padding (no whitespace, no data-URL prefix). */
const BASE64_PAYLOAD = /^[A-Za-z0-9+/]+={0,2}$/;

/**
 * Shortest string that is still accepted as an inline base64 image when it
 * also looks like a path. Keeps short, mistyped paths such as `/home/me/pic`
 * from being mistaken for image data.
 * NOTE(review): 64 is a heuristic floor, chosen on the assumption that no real encoded image is shorter.
 */
const MIN_BASE64_IMAGE_LENGTH = 64;

/**
 * Resolves `imageSource` to a base64 string.
 *
 * Strings starting with `/` or `~` are tried as file paths first (a leading
 * `~` is replaced by `$HOME`, or `/root` when `HOME` is unset). Because `/`
 * is part of the base64 alphabet — every base64-encoded JPEG starts with
 * `/9j/` — a failed read falls back to treating the string as inline base64
 * when it is a well-formed base64 payload. Otherwise the read error is rethrown.
 * Any other string is returned unchanged.
 */
async function resolveImageBase64(imageSource: string): Promise<string> {
  const looksLikePath = imageSource.startsWith("/") || imageSource.startsWith("~");
  if (!looksLikePath) return imageSource; // already base64

  const { readFile } = await import("node:fs/promises");
  const resolvedPath = imageSource.startsWith("~")
    ? imageSource.replace(/^~/, process.env.HOME ?? "/root")
    : imageSource;

  try {
    const buf = await readFile(resolvedPath);
    return buf.toString("base64");
  } catch (err) {
    const isInlineBase64 =
      imageSource.length >= MIN_BASE64_IMAGE_LENGTH &&
      imageSource.length % 4 === 0 &&
      BASE64_PAYLOAD.test(imageSource);
    if (isInlineBase64) return imageSource;
    throw err;
  }
}

/**
 * Ollama call with a single image input (vision / OCR).
 *
 * Builds an optional system message plus one user message carrying the prompt
 * and the image, then delegates to `callOllamaChat()`.
 *
 * Models recommended by the original author (all stated to fit an RTX 3090 24GB):
 *   fredrezones55/chandra-ocr-2:patch  5.8GB — OCR-specialised, documents/scans
 *   qwen3-vl:latest                    6.1GB — vision-language, image description + OCR
 *   qwen2.5vl:7b                       6.0GB — alternative to qwen3-vl
 *   minicpm-v:latest                   5.5GB — lightweight, good for simple OCR
 *
 * @param model       Ollama model name.
 * @param imageSource Absolute file path ("/…"), a "~"-prefixed path, or a base64 string.
 * @param prompt      User prompt sent alongside the image.
 * @param options     Optional system prompt, temperature (default 0.1) and abort signal.
 * @returns The result of the underlying chat call.
 * @throws The file-system error when `imageSource` looks like a path, cannot
 *         be read and is not a base64 payload; otherwise whatever
 *         `callOllamaChat()` throws.
 */
export async function callOllamaVision(
  model: string,
  imageSource: string,
  prompt: string,
  options?: {
    systemPrompt?: string;
    temperature?: number;
    signal?: AbortSignal;
  }
): Promise<OllamaResult> {
  const imageBase64 = await resolveImageBase64(imageSource);

  const messages: OllamaMessage[] = [];
  if (options?.systemPrompt) {
    messages.push({ role: "system", content: options.systemPrompt });
  }
  messages.push({ role: "user", content: prompt, images: [imageBase64] });

  return callOllamaChat(model, messages, {
    temperature: options?.temperature ?? 0.1,
    signal: options?.signal,
  });
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Modell-Infos (lokal installiert, passend für RTX 3090 24GB)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * All known local Ollama models, grouped by category.
 *
 * Each entry maps the Ollama model tag to the size figure (`vramGB`) and GPU
 * count (`gpus`) recorded by the original author.
 * NOTE(review): whether `vramGB` is download size or resident VRAM is not established in this file — confirm.
 */
export const LOCAL_CATALOG = {
  // --- Text / reasoning ---
  text: {
    /** 17GB — main all-rounder, 1 GPU */
    "qwen3.5:27b": { vramGB: 17, gpus: 1 },
    /** 19GB — built-in reasoning (DeepSeek R1), 1 GPU */
    "deepseek-r1:32b": { vramGB: 19, gpus: 1 },
    /** 18GB — code + general purpose, 128k context, 1 GPU */
    "qwen3-coder-30b-128k:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GPU-optimised variant of the Qwen3 coder, 1 GPU */
    "qwen3-coder-30b-gpu:latest": { vramGB: 18, gpus: 1 },
    /** 18GB — GLM-4.7 Flash, Chinese model, 1 GPU */
    "glm-4.7-flash:latest": { vramGB: 18, gpus: 1 },
    /** 17GB — Gemma4 26B by Google, 1 GPU */
    "gemma4:26b": { vramGB: 17, gpus: 1 },
    /** 9.6GB — Gemma4 E4B (efficiency variant), 1 GPU */
    "gemma4:e4b": { vramGB: 9.6, gpus: 1 },
    /** 9.0GB — Qwen2.5 14B Instruct, 1 GPU */
    "qwen2.5:14b-instruct": { vramGB: 9, gpus: 1 },
    /** 5.2GB — Qwen3 8B, fast for simple tasks */
    "qwen3:8b": { vramGB: 5.2, gpus: 1 },
    /** 4.9GB — Llama 3.1 8B */
    "llama3.1:8b": { vramGB: 4.9, gpus: 1 },
    /** 7.1GB — Mistral Nemo */
    "mistral-nemo:latest": { vramGB: 7.1, gpus: 1 },
  },
  // --- Code ---
  code: {
    /** 9.0GB — Qwen2.5-Coder 14B, 1 GPU */
    "qwen2.5-coder:14b": { vramGB: 9, gpus: 1 },
    /** 4.7GB — Qwen2.5-Coder 7B, fast */
    "qwen2.5-coder:7b": { vramGB: 4.7, gpus: 1 },
  },
  // --- Vision / OCR ---
  vision: {
    /** 5.8GB — OCR-specialised (Chandra OCR 2) */
    "fredrezones55/chandra-ocr-2:patch": { vramGB: 5.8, gpus: 1 },
    /** 6.1GB — Qwen3 vision-language model */
    "qwen3-vl:latest": { vramGB: 6.1, gpus: 1 },
    /** 6.0GB — Qwen2.5 vision-language 7B */
    "qwen2.5vl:7b": { vramGB: 6, gpus: 1 },
    /** 5.5GB — MiniCPM-V, lightweight */
    "minicpm-v:latest": { vramGB: 5.5, gpus: 1 },
    /** 3.3GB — Qwen3-VL 4B, very small */
    "qwen3-vl:4b": { vramGB: 3.3, gpus: 1 },
  },
  // --- Embedding ---
  embedding: {
    /** 4.7GB — Qwen3 Embedding */
    "qwen3-embedding:latest": { vramGB: 4.7, gpus: 1 },
  },
} as const;
|
||||
Loading…
Add table
Add a link
Reference in a new issue