697 lines
23 KiB
TypeScript
697 lines
23 KiB
TypeScript
|
|
/**
 * ollama-claim-extractor.ts
 * Pi extension + CLI: extracts individual claims from texts via a local Ollama instance.
 *
 * As a Pi extension: ~/.pi/agent/extensions/fact-checker/ollama-claim-extractor.ts
 * After changes, reload in Pi with: /reload
 *
 * As a CLI:
 *   npx tsx agenten/ollama-claim-extractor.ts "Textinhalt..."
 *   npx tsx agenten/ollama-claim-extractor.ts --only-checkable "Textinhalt..."
 *   npx tsx agenten/ollama-claim-extractor.ts --model qwen3.5:27b "Textinhalt..."
 *   npx tsx agenten/ollama-claim-extractor.ts --json "Textinhalt..."   (JSON output only)
 *
 * Recommended model: qwen3.5:9b (6.6 GB, 1 GPU, nearly the same precision as the 27B model, 2× faster)
 */
|
|||
|
|
|
|||
|
|
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|||
|
|
import { Type } from "@sinclair/typebox";
|
|||
|
|
import { fileURLToPath } from "node:url";
|
|||
|
|
import { createLogger, nullLogger, type Logger } from "../lib/logger.js";
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Typen
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** Category assigned to an extracted claim. */
export type ClaimType =
  | "fact"
  | "causal"
  | "statistical"
  | "quote"
  | "prediction"
  | "opinion";

/** Degree to which a claim can be verified empirically. */
export type Checkability =
  | "checkable"
  | "partly_checkable"
  | "not_checkable";
|
|||
|
|
|
|||
|
|
/** One discrete claim extracted from the analysed text. */
export type Claim = {
  /** Identifier of the claim, e.g. "c001". */
  claim_id: string;
  /** The claim itself, phrased as a sentence. */
  text: string;
  /** Category of the claim. */
  claim_type: ClaimType;
  /** Whether the claim can be verified. */
  checkability: Checkability;
  /** True when the claim should be backed by a citation. */
  needs_citation: boolean;
  /** Named entities mentioned in the claim. */
  entities: Array<string>;
  /** Time frame the claim refers to, or null when none is given. */
  time_scope: string | null;
  /** Sentence of the source text the claim was taken from. */
  source_sentence: string;
};
|
|||
|
|
|
|||
|
|
/** Result envelope of one extraction run. */
export type ClaimSet = {
  /** Fixed version tag of this result format. */
  schema_version: "1.0.0";
  /** Language of the analysed text as reported by the model. */
  text_language: string;
  /** Free-text remarks about the extraction; may be empty. */
  extraction_notes: string;
  /** Number of claims in the set. */
  total_claims: number;
  /** The extracted claims. */
  claims: Array<Claim>;
};
|
|||
|
|
|
|||
|
|
/**
 * The subset of the Ollama chat response that this module reads.
 * Every field is optional because the payload is not validated before use.
 */
type OllamaResponse = {
  /** Assistant message; `content` carries the JSON text produced by the model. */
  message?: {
    content?: string;
  };
  done?: boolean;
  /** Reported to callers as the output token count. */
  eval_count?: number;
  /** Reported to callers as the input token count. */
  prompt_eval_count?: number;
};
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Konfiguration
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** Model used when the caller does not name one. */
const DEFAULT_MODEL = "qwen3.5:9b";

/**
 * Turns the raw OLLAMA_HOST value into a base URL that can be prefixed to "/api/chat".
 *
 * Ollama itself accepts values without a scheme ("127.0.0.1:11434") or without a
 * port ("0.0.0.0"); passing those to fetch() unchanged fails with an invalid-URL
 * error. This helper therefore:
 *  - falls back to http://localhost:11434 for undefined, empty or blank values,
 *  - prepends "http://" when no scheme is present,
 *  - appends the default port 11434 to scheme-less values that carry neither port nor path,
 *  - strips trailing slashes so the result never produces "//api/chat".
 *
 * @param raw value of the OLLAMA_HOST environment variable, if any
 * @returns base URL without trailing slash
 */
function resolveOllamaHost(raw: string | undefined): string {
  const fallback = "http://localhost:11434";
  const value = raw?.trim() ?? "";
  if (value === "") return fallback;

  const hasScheme = /^[a-z][a-z0-9+.-]*:\/\//i.test(value);
  let base = (hasScheme ? value : `http://${value}`).replace(/\/+$/, "");

  if (!hasScheme) {
    const authority = base.slice("http://".length);
    // Only a bare host gets the default port; "host:1234" and "host/path" stay untouched.
    if (!authority.includes("/") && !/:\d+$/.test(authority)) {
      base += ":11434";
    }
  }
  return base;
}

/** Base URL of the Ollama server, taken from OLLAMA_HOST or defaulting to localhost. */
const OLLAMA_HOST = resolveOllamaHost(process.env.OLLAMA_HOST);

/** Claim limit applied when the caller does not give one. */
const DEFAULT_MAX_CLAIMS = 40;

/** Sampling temperature sent with every request. */
const TEMPERATURE = 0.1;

/** Context window (in tokens) requested from Ollama. */
const NUM_CTX = 8192;

// Texts longer than CHUNK_THRESHOLD characters are split into chunks.
// Budget for the 8192-token context: ~3000 characters of input + ~1000 prompt overhead
// + ~3200 output tokens (40 claims).
const CHUNK_THRESHOLD = 4000;
const CHUNK_SIZE = 3000;
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// JSON-Schema für Ollama structured output
|
|||
|
|
// (Teilmenge von claim.schema.json — ohne Pattern-Constraint, da Ollama
|
|||
|
|
// reguläre Ausdrücke im format-Parameter nicht immer unterstützt)
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** JSON schema of a single claim object inside the `claims` array. */
const CLAIM_ITEM_SCHEMA = {
  type: "object",
  additionalProperties: false,
  properties: {
    claim_id: { type: "string" },
    text: { type: "string" },
    claim_type: {
      type: "string",
      enum: ["fact", "causal", "statistical", "quote", "prediction", "opinion"],
    },
    checkability: {
      type: "string",
      enum: ["checkable", "partly_checkable", "not_checkable"],
    },
    needs_citation: { type: "boolean" },
    entities: { type: "array", items: { type: "string" } },
    time_scope: { type: ["string", "null"] },
    source_sentence: { type: "string" },
  },
  required: [
    "claim_id",
    "text",
    "claim_type",
    "checkability",
    "needs_citation",
    "entities",
    "time_scope",
    "source_sentence",
  ],
};

/**
 * JSON schema passed as the `format` parameter of the Ollama chat request so
 * that the model answers with a claim set of the expected structure.
 */
export const CLAIM_OLLAMA_SCHEMA = {
  type: "object",
  additionalProperties: false,
  properties: {
    schema_version: { type: "string" },
    text_language: { type: "string" },
    extraction_notes: { type: "string" },
    total_claims: { type: "integer" },
    claims: {
      type: "array",
      items: CLAIM_ITEM_SCHEMA,
    },
  },
  required: ["schema_version", "text_language", "extraction_notes", "total_claims", "claims"],
};
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// System-Prompt
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/**
 * Builds the (German) system prompt that instructs the model how to extract
 * and classify claims.
 *
 * @param maxClaims upper bound on the number of claims, interpolated into the prompt
 * @returns the complete prompt text
 */
function buildSystemPrompt(maxClaims: number): string {
  const promptLines = [
    "Du bist ein Experte für Faktenextraktion und Fact-Checking-Vorbereitung.",
    "",
    "Deine Aufgabe: Analysiere den Text und extrahiere alle Behauptungen als diskrete, einzeln prüfbare Einheiten.",
    `Extrahiere maximal ${maxClaims} Behauptungen. Bei sehr langen Texten priorisiere die wichtigsten und prüfbarsten.`,
    "",
    "REGELN für die Extraktion:",
    "- Formuliere jede Behauptung als eigenständigen, vollständigen Satz (nicht als Fragment)",
    "- Behalte den Sinn der Originalformulierung bei, mache Behauptungen aber selbstständig lesbar",
    '- claim_id: fortlaufend "c001", "c002", "c003", ...',
    "",
    "CLAIM TYPES:",
    '- fact: Konkrete Tatsachenbehauptung ("X ist Y", "X hat Z getan")',
    '- causal: Kausalbehauptung ("X hat zu Y geführt", "wegen X passiert Y")',
    "- statistical: Zahlen, Prozentwerte, Statistiken, Rankings",
    "- quote: Wörtliches oder indirektes Zitat einer Person",
    "- prediction: Prognose, Vorhersage, Erwartung über Zukunftsereignisse",
    "- opinion: Wertung, Meinung, normative Aussage (gut/schlecht/sollte)",
    "",
    "CHECKABILITY:",
    "- checkable: Empirisch überprüfbar durch Primärquellen, Datenbanken, offizielle Stellen",
    "- partly_checkable: Nur teilweise prüfbar (z.B. enthält sowohl Fakt als auch Wertung)",
    "- not_checkable: Reine Meinung, reine Prognose, Werturteil ohne Tatsachenkern",
    "",
    "NEEDS_CITATION: true wenn Zahlen, spezifische Fakten, Zitate oder Studienergebnisse vorhanden",
    "",
    "ENTITIES: Alle benannten Entitäten: Personen, Organisationen, Länder, Institutionen, Produkte, konkrete Daten",
    "",
    'TIME_SCOPE: Zeitrahmen wenn angegeben (z.B. "2024", "Q1 2025", "seit 1990"), sonst null',
    "",
    "SOURCE_SENTENCE: Der originale Satz aus dem Quelltext (wörtlich, max. 200 Zeichen)",
    "",
    "DUPLIKATE: Extrahiere jeden Sachverhalt nur einmal. Wenn derselbe Fakt im Text mehrfach vorkommt (z.B. als Einleitung und später als Detail), erstelle nur einen Claim dafür.",
    "",
    "Antworte NUR mit dem JSON-Objekt gemäß Schema. Kein Freitext davor oder danach.",
  ];
  return promptLines.join("\n");
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Text-Chunking für lange Texte
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/**
 * Cuts one oversized paragraph into pieces of at most `limit` characters.
 * Prefers the last sentence end (". ", "! ", "? " or a line break) inside the
 * window, then the last space, and only as a last resort cuts mid-word.
 */
function splitOversizedParagraph(paragraph: string, limit: number): string[] {
  const pieces: string[] = [];
  let rest = paragraph;

  while (rest.length > limit) {
    const window = rest.slice(0, limit);
    const sentenceEnd = Math.max(
      window.lastIndexOf(". "),
      window.lastIndexOf("! "),
      window.lastIndexOf("? "),
      window.lastIndexOf("\n")
    );
    // +1 keeps the punctuation mark with the sentence it terminates.
    let cut = sentenceEnd > 0 ? sentenceEnd + 1 : window.lastIndexOf(" ");
    if (cut <= 0) cut = limit;

    pieces.push(rest.slice(0, cut).trim());
    rest = rest.slice(cut).trim();
  }
  if (rest) pieces.push(rest);
  return pieces;
}

/**
 * Splits a long text into chunks of at most `maxChunkSize` characters.
 *
 * Paragraphs (separated by one or more blank lines, with LF or CRLF line
 * endings) are kept together and packed into chunks joined by "\n\n". A
 * paragraph that alone exceeds the limit is split further at sentence or
 * whitespace boundaries, so no chunk is larger than the limit.
 *
 * @param text the text to split
 * @param maxChunkSize maximum chunk length in characters; defaults to CHUNK_SIZE.
 *   Non-finite values or values below 1 fall back to CHUNK_SIZE.
 * @returns the non-empty chunks in original order; empty for blank input
 */
function splitIntoChunks(text: string, maxChunkSize: number = CHUNK_SIZE): string[] {
  const limit =
    Number.isFinite(maxChunkSize) && maxChunkSize >= 1 ? Math.floor(maxChunkSize) : CHUNK_SIZE;

  // "\r?\n" followed by optional blanks, at least twice: a blank line in LF or CRLF text.
  const paragraphs = text
    .split(/(?:\r?\n[ \t]*){2,}/)
    .map((p) => p.trim())
    .filter((p) => p.length > 0)
    .flatMap((p) => (p.length > limit ? splitOversizedParagraph(p, limit) : [p]));

  const chunks: string[] = [];
  let current = "";

  for (const para of paragraphs) {
    // +2 accounts for the "\n\n" separator inserted between paragraphs.
    if (current.length > 0 && current.length + para.length + 2 > limit) {
      chunks.push(current);
      current = para;
    } else {
      current = current ? `${current}\n\n${para}` : para;
    }
  }
  if (current) chunks.push(current);
  return chunks;
}
|
|||
|
|
|
|||
|
|
/**
 * Drops claims whose text repeats an earlier claim. Texts are compared after
 * lower-casing, collapsing whitespace runs to one space and trimming.
 * The first occurrence is kept and the original order is preserved.
 *
 * @param claims claims to filter
 * @returns a new array without the duplicates
 */
function deduplicateClaims(claims: Claim[]): Claim[] {
  const knownKeys = new Set<string>();
  const unique: Claim[] = [];

  for (const claim of claims) {
    const normalized = claim.text.toLowerCase().replace(/\s+/g, " ").trim();
    if (knownKeys.has(normalized)) continue;
    knownKeys.add(normalized);
    unique.push(claim);
  }
  return unique;
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Ollama-Aufruf
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/**
 * Entry point for claim extraction. Texts longer than CHUNK_THRESHOLD
 * characters go to the chunked extraction, shorter ones are sent to Ollama
 * in a single request.
 *
 * @param text text to analyse
 * @param model name of the Ollama model
 * @param maxClaims maximum number of claims to extract
 * @param signal optional abort signal, handed on to the extraction
 * @param logger optional logger; nullLogger is used when omitted
 * @returns the claim set plus token counts and latency in milliseconds
 */
export async function callOllamaClaimExtract(
  text: string,
  model: string,
  maxClaims: number,
  signal?: AbortSignal,
  logger?: Logger
): Promise<{ claimSet: ClaimSet; tokensIn: number; tokensOut: number; latencyMs: number }> {
  const log = logger ?? nullLogger;
  const fitsSinglePass = text.length <= CHUNK_THRESHOLD;

  if (fitsSinglePass) {
    log.debug("Single-Pass Extraktion", { textLength: text.length, model, maxClaims });
    return callOllamaClaimExtractSingle(text, model, maxClaims, signal, log);
  }

  // Too long for one request: split the text and extract chunk by chunk.
  log.info("Text zu lang für Single-Pass — Chunking aktiv", { textLength: text.length, threshold: CHUNK_THRESHOLD });
  return callOllamaClaimExtractChunked(text, model, maxClaims, signal, log);
}
|
|||
|
|
|
|||
|
|
/**
 * Extracts claims from a long text by splitting it into chunks, running one
 * extraction per chunk sequentially, and merging the results.
 *
 * The claim budget is divided evenly across the chunks (rounded up). The merged
 * claims are de-duplicated, truncated to `maxClaims` and renumbered "c001",
 * "c002", … Token counts are summed; latency covers the whole run.
 *
 * The reported language is the one named by most chunks, ignoring "unknown";
 * it is "unknown" only when no chunk reported a language.
 *
 * @param text text to analyse
 * @param model name of the Ollama model
 * @param maxClaims maximum number of claims in the merged result
 * @param signal optional abort signal, handed on to every chunk request
 * @param logger optional logger; nullLogger is used when omitted
 * @returns the merged claim set plus summed token counts and total latency
 * @throws Error when the text yields no chunk at all (blank input), and any
 *   error raised by the extraction of a single chunk
 */
async function callOllamaClaimExtractChunked(
  text: string,
  model: string,
  maxClaims: number,
  signal?: AbortSignal,
  logger?: Logger
): Promise<{ claimSet: ClaimSet; tokensIn: number; tokensOut: number; latencyMs: number }> {
  const log = logger ?? nullLogger;
  const t0 = Date.now();
  const chunks = splitIntoChunks(text);

  // Without this guard the division below yields Infinity and the function
  // would report an empty result as success, unlike the single-pass path.
  if (chunks.length === 0) {
    log.error("Kein extrahierbarer Text nach Chunking", { textLength: text.length });
    throw new Error("Text enthält keinen extrahierbaren Inhalt");
  }

  const claimsPerChunk = Math.ceil(maxClaims / chunks.length);

  log.info(`Text in ${chunks.length} Chunks aufgeteilt`, {
    chunks: chunks.length,
    claimsPerChunk,
    chunkLengths: chunks.map((c) => c.length),
  });

  let totalIn = 0;
  let totalOut = 0;
  const allClaims: Claim[] = [];
  const languageVotes = new Map<string, number>();
  const notes: string[] = [];

  for (const [index, chunk] of chunks.entries()) {
    const position = `${index + 1}/${chunks.length}`;
    log.info(`Chunk ${position} extrahieren...`, { chunkLength: chunk.length, claimsPerChunk });
    const result = await callOllamaClaimExtractSingle(chunk, model, claimsPerChunk, signal, log);
    log.info(`Chunk ${position} fertig`, {
      claims: result.claimSet.claims.length,
      tokensIn: result.tokensIn,
      tokensOut: result.tokensOut,
      latencyMs: result.latencyMs,
    });

    allClaims.push(...result.claimSet.claims);
    totalIn += result.tokensIn;
    totalOut += result.tokensOut;

    const chunkLanguage = result.claimSet.text_language;
    if (chunkLanguage !== "unknown") {
      languageVotes.set(chunkLanguage, (languageVotes.get(chunkLanguage) ?? 0) + 1);
    }
    if (result.claimSet.extraction_notes) notes.push(result.claimSet.extraction_notes);
  }

  // Majority vote; ties go to the language seen first.
  let language = "unknown";
  let bestVotes = 0;
  for (const [candidate, votes] of languageVotes) {
    if (votes > bestVotes) {
      language = candidate;
      bestVotes = votes;
    }
  }

  // De-duplicate and renumber
  const beforeDedup = allClaims.length;
  const unique = deduplicateClaims(allClaims).slice(0, maxClaims);
  const renumbered: Claim[] = unique.map((c, i) => ({
    ...c,
    claim_id: `c${String(i + 1).padStart(3, "0")}`,
  }));

  const latencyMs = Date.now() - t0;

  log.info("Chunking abgeschlossen", {
    totalBeforeDedup: beforeDedup,
    afterDedup: renumbered.length,
    totalTokensIn: totalIn,
    totalTokensOut: totalOut,
    totalLatencyMs: latencyMs,
  });

  return {
    claimSet: {
      schema_version: "1.0.0",
      text_language: language,
      // Joining avoids a dangling space when no chunk produced a note.
      extraction_notes: [`Text in ${chunks.length} Abschnitte aufgeteilt.`, ...notes].join(" "),
      total_claims: renumbered.length,
      claims: renumbered,
    },
    tokensIn: totalIn,
    tokensOut: totalOut,
    latencyMs,
  };
}
|
|||
|
|
|
|||
|
|
/**
 * Resolves after `ms` milliseconds, or rejects as soon as `signal` aborts,
 * so that a cancelled extraction does not sit out the retry delay.
 */
function sleepUnlessAborted(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    if (signal?.aborted) {
      reject(new Error("Extraktion abgebrochen"));
      return;
    }
    const onAbort = () => {
      clearTimeout(timer);
      reject(new Error("Extraktion abgebrochen"));
    };
    const timer = setTimeout(() => {
      signal?.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", onAbort, { once: true });
  });
}

/**
 * Sends one extraction request to Ollama's /api/chat endpoint and parses the
 * structured JSON answer into a ClaimSet.
 *
 * Connection failures are retried up to three times with a 15 s pause. An
 * aborted request is never retried: the abort error is rethrown immediately.
 *
 * Only the overall structure of the answer is checked (a JSON object with a
 * `claims` array); the individual claims are NOT validated against the schema.
 * `total_claims` is always derived from the length of the returned array
 * rather than from the number the model reports.
 *
 * @param text text to analyse
 * @param model name of the Ollama model
 * @param maxClaims maximum number of claims, passed into the system prompt
 * @param signal optional abort signal for the HTTP request and the retry pause
 * @param logger optional logger; nullLogger is used when omitted
 * @returns the claim set, prompt/output token counts (0 when not reported) and latency
 * @throws Error when the request fails after all retries or is aborted, on a
 *   non-OK HTTP status, an empty answer, invalid JSON, a missing `claims`
 *   array, or an empty `claims` array
 */
async function callOllamaClaimExtractSingle(
  text: string,
  model: string,
  maxClaims: number,
  signal?: AbortSignal,
  logger?: Logger
): Promise<{ claimSet: ClaimSet; tokensIn: number; tokensOut: number; latencyMs: number }> {
  const log = logger ?? nullLogger;
  const t0 = Date.now();

  const body = {
    model,
    messages: [
      {
        role: "system",
        content: buildSystemPrompt(maxClaims),
      },
      {
        role: "user",
        content: `Extrahiere alle Behauptungen aus folgendem Text:\n\n---\n${text}\n---`,
      },
    ],
    format: CLAIM_OLLAMA_SCHEMA,
    stream: false,
    options: {
      temperature: TEMPERATURE,
      num_ctx: NUM_CTX,
    },
  };

  log.debug("Ollama-Aufruf gestartet", { model, textLength: text.length, num_ctx: NUM_CTX });

  // Retry on temporary connection errors (Ollama restarting or briefly overloaded)
  const MAX_RETRIES = 3;
  const RETRY_DELAY_MS = 15_000; // 15 s pause before a retry
  let resp: Response | undefined;

  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    try {
      resp = await fetch(`${OLLAMA_HOST}/api/chat`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(body),
        signal,
      });
      break; // connection succeeded
    } catch (err) {
      // A cancelled request is not a transient failure — do not wait and retry.
      if (signal?.aborted) throw err;

      const isLast = attempt === MAX_RETRIES;
      log.warn(`Ollama fetch fehlgeschlagen (Versuch ${attempt}/${MAX_RETRIES})`, {
        error: err instanceof Error ? err.message : String(err),
        retryInMs: isLast ? 0 : RETRY_DELAY_MS,
      });
      if (isLast) {
        throw new Error(
          `fetch failed nach ${MAX_RETRIES} Versuchen: ${err instanceof Error ? err.message : err}`
        );
      }
      // Wait before retrying — Ollama may be restarting
      await sleepUnlessAborted(RETRY_DELAY_MS, signal);
    }
  }

  // Cannot happen (the loop either assigns resp or throws), but narrows the type without `!`.
  if (!resp) {
    throw new Error(`fetch failed nach ${MAX_RETRIES} Versuchen`);
  }

  if (!resp.ok) {
    const errorText = await resp.text().catch(() => "");
    log.error("Ollama API Fehler", { status: resp.status, body: errorText.slice(0, 200) });
    throw new Error(`Ollama API Fehler ${resp.status}: ${errorText}`);
  }

  const data = (await resp.json()) as OllamaResponse;
  const raw = data.message?.content ?? "";

  log.debug("Ollama-Antwort empfangen", {
    promptTokens: data.prompt_eval_count,
    outputTokens: data.eval_count,
    rawLength: raw.length,
  });

  if (!raw.trim()) {
    log.error("Leere Ollama-Antwort", { promptTokens: data.prompt_eval_count });
    throw new Error("Leere Antwort von Ollama erhalten");
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    log.error("JSON-Parse-Fehler", { rawPreview: raw.slice(0, 200) });
    throw new Error(`Ollama-Ausgabe ist kein gültiges JSON: ${raw.slice(0, 200)}`);
  }

  // Basic structure check (not a full schema validator). Valid JSON that is
  // not an object (null, number, string) is treated like a missing `claims`.
  const p: Record<string, unknown> =
    typeof parsed === "object" && parsed !== null ? (parsed as Record<string, unknown>) : {};
  const claims = p.claims;
  if (!Array.isArray(claims)) {
    log.error("Ungültige Struktur: claims fehlt", { keys: Object.keys(p) });
    throw new Error(`Ungültige Struktur: 'claims' fehlt oder ist kein Array`);
  }

  if (claims.length === 0) {
    // No claims at all points to a context overflow or a model failure
    const usedCtx = data.prompt_eval_count ?? 0;
    log.warn("0 Claims extrahiert", { promptTokens: usedCtx, num_ctx: NUM_CTX, textLength: text.length });
    throw new Error(
      `Ollama hat 0 Claims extrahiert (prompt_tokens=${usedCtx}). ` +
        `Text zu lang für num_ctx=${NUM_CTX} oder Modell-Fehler.`
    );
  }

  const claimSet: ClaimSet = {
    schema_version: "1.0.0",
    text_language: typeof p.text_language === "string" ? p.text_language : "unknown",
    extraction_notes: typeof p.extraction_notes === "string" ? p.extraction_notes : "",
    // The model's own count can disagree with the array; the array is authoritative.
    total_claims: claims.length,
    // NOTE(review): items are trusted to match Claim — they are not validated here.
    claims: claims as Claim[],
  };

  return {
    claimSet,
    tokensIn: data.prompt_eval_count ?? 0,
    tokensOut: data.eval_count ?? 0,
    latencyMs: Date.now() - t0,
  };
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Formatierung (Pi-Ausgabe + CLI-Ausgabe)
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** Upper-case German label shown in brackets for each claim type. */
const TYPE_LABEL: Record<ClaimType, string> = {
  "fact": "FAKT",
  "causal": "KAUSAL",
  "statistical": "STATISTIK",
  "quote": "ZITAT",
  "prediction": "PROGNOSE",
  "opinion": "MEINUNG",
};

/** One-character marker shown for each checkability level. */
const CHECK_ICON: Record<Checkability, string> = {
  "checkable": "✓",
  "partly_checkable": "~",
  "not_checkable": "✗",
};
|
|||
|
|
|
|||
|
|
/**
 * Renders a claim set as Markdown text for the Pi tool result and the CLI.
 *
 * Layout: heading with the total claim count, language, optional note, then
 * one section per checkability level (empty sections are skipped), and a
 * footer naming the model, token counts (omitted when both are 0) and latency.
 *
 * @param claimSet claim set to render
 * @param onlyCheckable when true, only claims marked "checkable" are listed
 * @param model model name shown in the footer
 * @param tokensIn prompt token count
 * @param tokensOut output token count
 * @param latencyMs duration in milliseconds, printed in seconds with one decimal
 * @returns the formatted text, lines joined with "\n"
 */
function formatClaimSet(
  claimSet: ClaimSet,
  onlyCheckable: boolean,
  model: string,
  tokensIn: number,
  tokensOut: number,
  latencyMs: number
): string {
  const total = claimSet.total_claims;
  const shown = onlyCheckable
    ? claimSet.claims.filter((claim) => claim.checkability === "checkable")
    : claimSet.claims;

  const pluralSuffix = total !== 1 ? "en" : "";
  const filterRemark =
    onlyCheckable && shown.length < total ? ` (${shown.length} prüfbar angezeigt)` : "";

  const out: string[] = [
    `## Claim-Extraktion: ${total} Behauptung${pluralSuffix} gefunden${filterRemark}`,
    `Sprache: ${claimSet.text_language}`,
  ];
  if (claimSet.extraction_notes) {
    out.push(`Hinweis: ${claimSet.extraction_notes}`);
  }
  out.push("");

  // Sections in display order; the two lower levels are hidden in onlyCheckable mode.
  const sections: Array<[Checkability, string]> = [["checkable", "✓ Prüfbar"]];
  if (!onlyCheckable) {
    sections.push(["partly_checkable", "~ Teilweise prüfbar"], ["not_checkable", "✗ Nicht prüfbar"]);
  }

  for (const [level, title] of sections) {
    const group = shown.filter((claim) => claim.checkability === level);
    if (group.length === 0) continue;

    out.push(`**${title} (${group.length}):**`);
    for (const claim of group) {
      const icon = CHECK_ICON[claim.checkability];
      const label = TYPE_LABEL[claim.claim_type];
      out.push(`\`${claim.claim_id}\` ${icon} [${label}] ${claim.text}`);

      const meta = [
        claim.entities.length > 0 ? `Entitäten: ${claim.entities.join(", ")}` : null,
        claim.time_scope ? `Zeit: ${claim.time_scope}` : null,
        claim.needs_citation ? `Zitat nötig: ja` : null,
      ].filter((entry): entry is string => entry !== null);

      if (meta.length > 0) {
        out.push(` ${meta.join(" | ")}`);
      }
      out.push("");
    }
  }

  const seconds = (latencyMs / 1000).toFixed(1);
  const tokenInfo = tokensIn || tokensOut ? ` · ${tokensIn}+${tokensOut} Tokens` : "";
  out.push(`_[Ollama: ${model}${tokenInfo} · ${seconds}s]_`);

  return out.join("\n");
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Pi-Extension-Parameters (TypeBox)
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** Descriptions attached to the tool parameters (shown to the calling model). */
const PARAM_DESCRIPTIONS = {
  text: "Der zu analysierende Text. Kann ein Artikel, Blogeintrag, Nachrichtentext oder beliebiger Fließtext sein.",
  onlyCheckable: "Wenn true: nur empirisch prüfbare Claims ausgeben (checkable). Standard: false.",
  maxClaims: `Maximale Anzahl Claims pro Aufruf. Standard: ${DEFAULT_MAX_CLAIMS}.`,
  model: `Ollama-Modell für die Extraktion. Standard: ${DEFAULT_MODEL}. Empfohlene Alternative: qwen3.5:27b für maximale Präzision.`,
};

/**
 * TypeBox schema of the `extract_claims` tool parameters:
 * a required `text` plus the optional `onlyCheckable`, `maxClaims` and `model`.
 */
const PARAMS = Type.Object({
  text: Type.String({ description: PARAM_DESCRIPTIONS.text }),
  onlyCheckable: Type.Optional(Type.Boolean({ description: PARAM_DESCRIPTIONS.onlyCheckable })),
  maxClaims: Type.Optional(Type.Number({ description: PARAM_DESCRIPTIONS.maxClaims })),
  model: Type.Optional(Type.String({ description: PARAM_DESCRIPTIONS.model })),
});
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// Pi-Extension: Default Export
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/** Hard upper bound for the number of claims a single tool call may request. */
const MAX_CLAIMS_LIMIT = 60;

/**
 * Normalises the `maxClaims` tool parameter to an integer in [1, MAX_CLAIMS_LIMIT].
 * A missing or non-finite value yields DEFAULT_MAX_CLAIMS; fractions are truncated.
 * Without the lower bound, 0 or negative values would reach the prompt
 * ("maximal 0 Behauptungen") and end in a misleading "0 Claims" error.
 */
function clampMaxClaims(requested: number | undefined): number {
  if (requested === undefined || !Number.isFinite(requested)) {
    return DEFAULT_MAX_CLAIMS;
  }
  return Math.min(Math.max(Math.trunc(requested), 1), MAX_CLAIMS_LIMIT);
}

/**
 * Pi extension entry point: registers the `extract_claims` tool.
 *
 * The tool runs the extraction with the given (or default) model, formats the
 * claim set as text, and returns it together with summary details. Failures
 * are not thrown to the host; they are returned as a text result with the
 * error message.
 *
 * @param pi extension API handed in by the Pi host
 */
export default function claimExtractorExtension(pi: ExtensionAPI) {
  pi.registerTool({
    name: "extract_claims",
    label: "Claim-Extraktion",
    description:
      "Zerlegt einen Text in einzelne, diskrete Behauptungen (Claims) als Vorbereitung für Fact-Checking. " +
      "Nutze dieses Tool wenn: ein Artikel auf Fakten geprüft werden soll, Behauptungen aus einem Text " +
      "identifiziert und klassifiziert werden sollen, oder ein Verifikations-Workflow gestartet werden soll. " +
      "Läuft lokal via Ollama — keine API-Kosten.",
    promptGuidelines: [
      "Use extract_claims when the user wants to fact-check an article, blog post, or any text.",
      "Use extract_claims before calling verify or research_web on specific claims.",
      "Pass the full text as the 'text' parameter — do not summarize or shorten it first.",
      "If the user only wants checkable claims, set onlyCheckable=true.",
      "After extraction, ask the user which claims they want to verify, or offer to run the verifier on all checkable claims.",
      "The claim_ids (c001, c002, ...) can be referenced in follow-up tool calls to the verifier.",
      "Always show the full formatted output to the user, including the [Ollama: ...] cost line.",
    ],
    parameters: PARAMS,
    async execute(_toolCallId, params, signal) {
      const model = params.model ?? DEFAULT_MODEL;
      const maxClaims = clampMaxClaims(params.maxClaims);
      const onlyCheckable = params.onlyCheckable ?? false;

      try {
        const { claimSet, tokensIn, tokensOut, latencyMs } = await callOllamaClaimExtract(
          params.text,
          model,
          maxClaims,
          signal
        );

        const text = formatClaimSet(
          claimSet,
          onlyCheckable,
          model,
          tokensIn,
          tokensOut,
          latencyMs
        );

        return {
          content: [{ type: "text", text }],
          details: {
            model,
            totalClaims: claimSet.total_claims,
            checkableClaims: claimSet.claims.filter((c) => c.checkability === "checkable").length,
            textLanguage: claimSet.text_language,
            // 0 means "not reported" and is surfaced as null
            tokensIn: tokensIn || null,
            tokensOut: tokensOut || null,
            latencyMs,
          },
        };
      } catch (err) {
        const msg = err instanceof Error ? err.message : "Unbekannter Fehler";
        return {
          content: [{ type: "text", text: `Fehler bei Claim-Extraktion: ${msg}` }],
        };
      }
    },
  });
}
|
|||
|
|
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
// CLI-Modus
|
|||
|
|
// ---------------------------------------------------------------------------
|
|||
|
|
|
|||
|
|
/**
 * Parses the CLI arguments (without the node/script prefix).
 *
 * Recognised options: `--model <name>`, `--max-claims <n>`, `--only-checkable`,
 * `--json`, `--verbose` / `-v`. Any other argument that does not start with
 * "--" is treated as part of the text; the parts are joined with a space.
 * Unknown "--" options, and value options without a following value, are ignored.
 *
 * `--max-claims` must be a positive integer. An invalid value (non-numeric,
 * zero, negative) is consumed, reported on stderr and otherwise ignored, so
 * the previous value (normally the default) stays in effect instead of NaN.
 *
 * @param args raw arguments, e.g. `process.argv.slice(2)`
 * @returns the parsed options and the trimmed text (empty when none was given)
 */
function parseCliArgs(args: string[]): {
  text: string;
  model: string;
  maxClaims: number;
  onlyCheckable: boolean;
  jsonOutput: boolean;
  verbose: boolean;
} {
  let model = DEFAULT_MODEL;
  let maxClaims = DEFAULT_MAX_CLAIMS;
  let onlyCheckable = false;
  let jsonOutput = false;
  let verbose = false;
  const textParts: string[] = [];

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    const next = args[i + 1];

    if (arg === "--model" && next) {
      model = next;
      i++;
    } else if (arg === "--max-claims" && next) {
      i++; // the value is consumed even when it turns out to be invalid
      const candidate = /^\d+$/.test(next.trim()) ? parseInt(next, 10) : NaN;
      if (Number.isSafeInteger(candidate) && candidate >= 1) {
        maxClaims = candidate;
      } else {
        console.error(
          `Warnung: Ungültiger Wert für --max-claims ("${next}") — verwende ${maxClaims}.`
        );
      }
    } else if (arg === "--only-checkable") {
      onlyCheckable = true;
    } else if (arg === "--json") {
      jsonOutput = true;
    } else if (arg === "--verbose" || arg === "-v") {
      verbose = true;
    } else if (!arg.startsWith("--")) {
      textParts.push(arg);
    }
  }

  const text = textParts.join(" ").trim();
  return { text, model, maxClaims, onlyCheckable, jsonOutput, verbose };
}
|
|||
|
|
|
|||
|
|
/**
 * CLI main routine: parses the process arguments, runs the extraction and
 * prints either the formatted report or the raw ClaimSet JSON to stdout.
 *
 * Prints the usage text and exits with code 0 when called without arguments
 * or when `--help` / `-h` appears anywhere in the argument list. Exits with
 * code 1 when no text was given or the extraction fails. Status output goes
 * to stderr so that `--json` output on stdout stays clean.
 */
async function runCli(): Promise<void> {
  const args = process.argv.slice(2);

  // Help must win regardless of position, e.g. "--verbose --help".
  const wantsHelp = args.some((a) => a === "--help" || a === "-h");

  if (args.length === 0 || wantsHelp) {
    console.log(`
Claim-Extraktor (Ollama) — Behauptungen aus Text extrahieren

Verwendung:
npx tsx agenten/ollama-claim-extractor.ts [Optionen] "Text..."

Optionen:
--model <name> Ollama-Modell (Standard: ${DEFAULT_MODEL})
--max-claims <n> Maximale Claims (Standard: ${DEFAULT_MAX_CLAIMS})
--only-checkable Nur prüfbare Claims anzeigen
--json Ausgabe als reines JSON (ClaimSet)
--verbose, -v Ausführliche Ausgabe + Log-Datei in ~/.pi/agent/logs/
--help Diese Hilfe

Beispiele:
npx tsx agenten/ollama-claim-extractor.ts "Die Erde hat 8 Milliarden Einwohner."
npx tsx agenten/ollama-claim-extractor.ts --only-checkable "$(cat artikel.txt)"
npx tsx agenten/ollama-claim-extractor.ts --verbose "$(cat langer-artikel.txt)"
npx tsx agenten/ollama-claim-extractor.ts --model deepseek-r1:32b "..."
npx tsx agenten/ollama-claim-extractor.ts --json "..." > claims.json
`);
    process.exit(0);
  }

  const { text, model, maxClaims, onlyCheckable, jsonOutput, verbose } = parseCliArgs(args);

  if (!text) {
    console.error("Fehler: Kein Text übergeben. Nutze --help für Hinweise.");
    process.exit(1);
  }

  if (!jsonOutput) {
    console.error(
      `\nOllama-Modell: ${model} | Max. Claims: ${maxClaims} | Nur prüfbar: ${onlyCheckable}\n`
    );
  }

  const log = createLogger({ verbose });

  try {
    const { claimSet, tokensIn, tokensOut, latencyMs } = await callOllamaClaimExtract(
      text,
      model,
      maxClaims,
      undefined,
      log
    );

    if (jsonOutput) {
      console.log(JSON.stringify(claimSet, null, 2));
    } else {
      console.log(
        formatClaimSet(claimSet, onlyCheckable, model, tokensIn, tokensOut, latencyMs)
      );
    }
  } catch (err) {
    console.error("Fehler:", err instanceof Error ? err.message : err);
    process.exit(1);
  }
}
|
|||
|
|
|
|||
|
|
// CLI entry point — skipped when the module is loaded as a Pi extension,
// because then process.argv[1] is not this file.
const __filename = fileURLToPath(import.meta.url);
if (process.argv[1] === __filename) {
  // runCli handles extraction errors itself; this catch covers anything thrown
  // outside its try block so that no rejection goes unhandled.
  runCli().catch((err: unknown) => {
    console.error("Fehler:", err instanceof Error ? err.message : err);
    process.exit(1);
  });
}
|