#!/usr/bin/env bash
# tests/run_corpus.sh
# Runs every test-corpus case through verify-article and computes precision/recall.
#
# Usage:
#   cd ~/Pi_Agent_Projekts/text_agent
#   bash tests/run_corpus.sh                      # all cases
#   bash tests/run_corpus.sh case_001 case_002    # only the named cases
#   bash tests/run_corpus.sh --mode deep          # Perplexity mode
#   bash tests/run_corpus.sh --no-cache           # bypass the cache
#
# Output:
#   tests/results/<timestamp>/              ← JSON reports, one per case
#   tests/results/<timestamp>/summary.txt   ← precision/recall summary

# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# One results directory per run, named after the local start time.
TIMESTAMP="$(date '+%Y-%m-%d_%H-%M-%S')"

# All paths are anchored at the directory that contains this script, so the
# script can be started from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CORPUS_DIR="${SCRIPT_DIR}/corpus"
RESULTS_DIR="${SCRIPT_DIR}/results/${TIMESTAMP}"

# TypeScript entry point of the agent under test (run through `npx tsx`).
AGENT="${SCRIPT_DIR}/../agenten/llama-verify-article.ts"

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------

MODE="fast"
EXTRA_FLAGS=""
SELECTED_CASES=()

#######################################
# Parses the command line into the globals above.
#
# A `while`/`shift` loop is used on purpose: `shift` inside `for arg in "$@"`
# does not advance the loop (the word list is expanded once, up front), so the
# value after `--mode` was only picked up when `--mode` was the first argument.
#
# Globals:
#   MODE           (written) value of --mode <v> / --mode=<v>; default "fast"
#   EXTRA_FLAGS    (written) space-separated flags passed through to the agent
#   SELECTED_CASES (written) every argument matching case_*
# Arguments:
#   the script's command-line arguments
# Returns:
#   0 on success; exits with status 2 when --mode has no value.
#######################################
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --mode)
        if [ $# -lt 2 ]; then
          echo "Fehler: --mode erwartet einen Wert" >&2
          exit 2
        fi
        MODE="$2"
        shift # consume the value; the option itself is shifted below
        ;;
      --mode=*)
        MODE="${1#--mode=}"
        ;;
      --no-cache)
        EXTRA_FLAGS="${EXTRA_FLAGS} --no-cache"
        ;;
      case_*)
        SELECTED_CASES+=("$1")
        ;;
      *)
        # Unknown arguments are still ignored, but no longer silently.
        echo "Warnung: unbekanntes Argument ignoriert: $1" >&2
        ;;
    esac
    shift
  done
}

parse_args "$@"

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

# Create this run's results directory together with any missing parents.
if [[ ! -d "${RESULTS_DIR}" ]]; then
  mkdir -p "${RESULTS_DIR}"
fi

# Helper functions: each prints all of its arguments on one line, wrapped in
# an ANSI SGR colour/attribute sequence and followed by a reset. Because
# `echo -e` is used, backslash escapes inside the text are expanded as well.
_paint() {
  local sgr="$1"
  shift
  echo -e "\033[${sgr}m$*\033[0m"
}

green()  { _paint '0;32' "$@"; }
red()    { _paint '0;31' "$@"; }
yellow() { _paint '0;33' "$@"; }
bold()   { _paint '1' "$@"; }

# ---------------------------------------------------------------------------
# Determine the cases to run
# ---------------------------------------------------------------------------

if [ ${#SELECTED_CASES[@]} -eq 0 ]; then
  # No case was named on the command line: take every case_* entry of the
  # corpus. A glob replaces the former `ls | xargs basename` pipeline, which
  # mangled names containing quotes, backslashes or whitespace and started one
  # extra process per case. Glob results are sorted, as the `ls` output was.
  CASES=()
  for case_path in "${CORPUS_DIR}"/case_*; do
    # Without a match the pattern is left as a literal string; skip it.
    if [ ! -e "${case_path}" ] && [ ! -L "${case_path}" ]; then
      continue
    fi
    CASES+=("${case_path##*/}") # strip the directory part, keep the name
  done
else
  CASES=("${SELECTED_CASES[@]}")
fi

# Nothing to do is treated as a failure.
if [ ${#CASES[@]} -eq 0 ]; then
  echo "Keine Fälle in ${CORPUS_DIR} gefunden."
  exit 1
fi

bold "Pi Text-Agent — Testkorpus-Auswertung"
echo "Modus: ${MODE} | Fälle: ${#CASES[@]} | Ergebnisse: ${RESULTS_DIR}"
echo ""

# ---------------------------------------------------------------------------
# Metrics (global counters)
# ---------------------------------------------------------------------------

# Claim-level counters. "Positive" means the status "contradicted":
#   TRUE_POS   expected contradicted     → actually contradicted
#   FALSE_POS  expected not contradicted → actually contradicted
#   FALSE_NEG  expected contradicted     → actually not contradicted
#   TRUE_NEG   expected status other than contradicted → exactly that status
TOTAL_CLAIMS=0
TRUE_POS=0 FALSE_POS=0
FALSE_NEG=0 TRUE_NEG=0

# Case-level outcome: passed, failed expectations, agent run errored.
CASE_PASS=0 CASE_FAIL=0 CASE_ERROR=0

# Accumulated cost (USD, taken from the reports) and wall time (milliseconds).
TOTAL_COST=0 TOTAL_TIME=0

# ---------------------------------------------------------------------------
# Per-case processing
# ---------------------------------------------------------------------------

# Prints the current time in milliseconds since the epoch.
# NOTE: the %3N format is a GNU date extension.
now_ms() {
  date +%s%3N
}

#######################################
# Prints the totalCostUSD value of a report, or 0 when the report cannot be
# read or the value is not a number.
# Arguments: $1 - path to the JSON report
# Outputs:   the cost on stdout
#######################################
report_cost() {
  # The path is passed as an argument instead of being pasted into the Python
  # source, so quotes or backslashes in it cannot break the script.
  python3 -c '
import json, sys
try:
    with open(sys.argv[1], encoding="utf-8") as fh:
        value = json.load(fh).get("totalCostUSD", 0)
except Exception:
    value = 0
if isinstance(value, bool) or not isinstance(value, (int, float)):
    value = 0
print(value)
' "$1" 2>/dev/null || echo "0"
}

#######################################
# Prints the sum of two numbers; falls back to the first one unchanged when
# either value is not a valid JSON number.
# Arguments: $1 - running total, $2 - amount to add
#######################################
add_costs() {
  python3 -c '
import json, sys
print(json.loads(sys.argv[1]) + json.loads(sys.argv[2]))
' "$1" "$2" 2>/dev/null || echo "$1"
}

#######################################
# Compares the expectations of one case with the agent report.
#
# For every entry of "claims" in the expectations file that has a non-empty
# "text_contains" and "expected_status", one line is printed:
#   <expected_status> US <actual_status> US <text_contains>
# (US = ASCII unit separator 0x1f). The actual status is the "status" of the
# first report result whose "claim_text" contains text_contains, compared
# case-insensitively; "not_found" when no result matches; "error" when the
# report cannot be read or evaluated.
#
# Everything happens in a single Python process per case; file names and
# claim texts never become part of the Python source.
#
# Arguments: $1 - path to expected.json, $2 - path to the JSON report
# Returns:   non-zero when the expectations file cannot be read or parsed
#######################################
evaluate_claims() {
  python3 -c '
import json, sys

SEP = "\x1f"

with open(sys.argv[1], encoding="utf-8") as fh:
    expected = json.load(fh)

try:
    with open(sys.argv[2], encoding="utf-8") as fh:
        results = json.load(fh).get("results", [])
except Exception:
    results = None


def actual_status(needle):
    if results is None:
        return "error"
    try:
        needle = needle.lower()
        for entry in results:
            if needle in entry.get("claim_text", "").lower():
                return entry.get("status", "not_found")
        return "not_found"
    except Exception:
        return "error"


def flat(value):
    # keep every record on one line and the separator unambiguous
    return str(value).replace(SEP, " ").replace("\r", " ").replace("\n", " ")


for claim in expected.get("claims") or []:
    if not isinstance(claim, dict):
        continue
    text = claim.get("text_contains") or ""
    want = claim.get("expected_status") or ""
    if not text or not want:
        continue
    text = str(text)
    print(SEP.join((flat(want), flat(actual_status(text)), flat(text))))
' "$1" "$2"
}

for case_name in "${CASES[@]}"; do
  case_dir="${CORPUS_DIR}/${case_name}"
  input_file="${case_dir}/input.txt"
  expected_file="${case_dir}/expected.json"

  # A case needs both its input text and its expectations; otherwise skip it.
  if [ ! -f "${input_file}" ]; then
    yellow " ${case_name}: input.txt nicht gefunden — übersprungen"
    continue
  fi
  if [ ! -f "${expected_file}" ]; then
    yellow " ${case_name}: expected.json nicht gefunden — übersprungen"
    continue
  fi

  echo -n " ${case_name}: "
  result_file="${RESULTS_DIR}/${case_name}.json"
  stderr_file="${RESULTS_DIR}/${case_name}.stderr"
  t_start=$(now_ms)

  # Call verify-article (per the original note, --job-id caches claims.json
  # between runs). stdout is the JSON report; stderr is kept in a scratch file
  # so that a failure can be diagnosed from errors.log.
  agent_rc=0
  # shellcheck disable=SC2086 # EXTRA_FLAGS is a space-separated flag list
  npx tsx "${AGENT}" \
    --mode "${MODE}" \
    --job-id "corpus-${case_name}" \
    --json \
    ${EXTRA_FLAGS} \
    "$(< "${input_file}")" \
    > "${result_file}" 2> "${stderr_file}" || agent_rc=$?
  elapsed_ms=$(($(now_ms) - t_start))

  if [ "${agent_rc}" -ne 0 ]; then
    red "FEHLER (${elapsed_ms}ms)"
    CASE_ERROR=$((CASE_ERROR + 1))
    {
      echo "${case_name}: Fehlerhafter Exit-Code von verify-article (${agent_rc})"
      if [ -s "${stderr_file}" ]; then
        cat "${stderr_file}"
      fi
    } >> "${RESULTS_DIR}/errors.log"
    rm -f -- "${stderr_file}"
    continue
  fi
  rm -f -- "${stderr_file}"

  # Cost taken from the report
  cost=$(report_cost "${result_file}")
  TOTAL_COST=$(add_costs "${TOTAL_COST}" "${cost}")
  TOTAL_TIME=$((TOTAL_TIME + elapsed_ms))

  # Check the expectations. Expectations that cannot be read count as an
  # error instead of a pass with zero checked claims.
  if ! claims_raw=$(evaluate_claims "${expected_file}" "${result_file}" 2>/dev/null); then
    red "FEHLER (expected.json nicht auswertbar)"
    CASE_ERROR=$((CASE_ERROR + 1))
    echo "${case_name}: expected.json konnte nicht ausgewertet werden" >> "${RESULTS_DIR}/errors.log"
    continue
  fi

  case_pass=true
  claim_report=()

  while IFS=$'\x1f' read -r expected_status actual_status text_contains; do
    # The here-string delivers one empty line when there are no claims.
    if [ -z "${expected_status}" ]; then
      continue
    fi

    TOTAL_CLAIMS=$((TOTAL_CLAIMS + 1))
    snippet="${text_contains:0:50}"

    # Update the metrics
    if [ "${expected_status}" = "${actual_status}" ]; then
      # Exact match
      if [ "${expected_status}" = "contradicted" ]; then
        TRUE_POS=$((TRUE_POS + 1))
      else
        TRUE_NEG=$((TRUE_NEG + 1))
      fi
      claim_report+=(" ✓ [${actual_status}] ${snippet}")
    else
      # Mismatch
      case_pass=false
      if [ "${expected_status}" = "contradicted" ]; then
        FALSE_NEG=$((FALSE_NEG + 1))
        claim_report+=(" ✗ Erwartet contradicted, bekam ${actual_status}: ${snippet}")
      elif [ "${actual_status}" = "contradicted" ]; then
        FALSE_POS=$((FALSE_POS + 1))
        claim_report+=(" ✗ Falsch widersprüchlich: ${snippet}")
      else
        # e.g. supported vs mixed: fails the case but is counted in none of
        # the four confusion-matrix cells
        claim_report+=(" ~ Erwartet ${expected_status}, bekam ${actual_status}: ${snippet}")
      fi
    fi
  done <<< "${claims_raw}"

  if [ "${case_pass}" = true ]; then
    green "OK (${elapsed_ms}ms, \$${cost})"
    CASE_PASS=$((CASE_PASS + 1))
  else
    red "FEHLGESCHLAGEN (${elapsed_ms}ms, \$${cost})"
    CASE_FAIL=$((CASE_FAIL + 1))
  fi

  # Per-claim details: an empty line, then one line per claim. The lines are
  # printed verbatim, so backslashes in a claim text are no longer interpreted.
  if [ ${#claim_report[@]} -gt 0 ]; then
    echo ""
    printf '%s\n' "${claim_report[@]}"
  fi
done

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------

# Prints $1 / ($1 + $2) as a percentage with one decimal place, or "n/a" when
# the python3 call fails. Both arguments are the script's integer counters.
share_percent() {
  python3 -c "print(f'{$1 / ($1 + $2) * 100:.1f}%')" 2>/dev/null || echo "n/a"
}

# Precision: how many of the claims judged "contradicted" really are wrong.
if ((TRUE_POS + FALSE_POS > 0)); then
  precision=$(share_percent "${TRUE_POS}" "${FALSE_POS}")
else
  precision="n/a (keine contradicted-Urteile)"
fi

# Recall: how many of the claims known to be wrong were detected.
if ((TRUE_POS + FALSE_NEG > 0)); then
  recall=$(share_percent "${TRUE_POS}" "${FALSE_NEG}")
else
  recall="n/a (keine erwarteten contradicted-Claims)"
fi

# Report on the terminal
echo ""
bold "=============================="
bold "Ergebnisse"
bold "=============================="
cat <<EOF

Fälle: ${CASE_PASS} OK | ${CASE_FAIL} fehlgeschlagen | ${CASE_ERROR} Fehler
Claims: ${TOTAL_CLAIMS} geprüft

Precision: ${precision} (Anteil korrekt widerlegter unter allen Widerlegungen)
Recall: ${recall} (Anteil erkannter Fehler unter allen bekannten Fehlern)

True Positives: ${TRUE_POS} (korrekter contradicted-Fund)
False Positives: ${FALSE_POS} (fälschlich widerlegter korrekter Fakt)
False Negatives: ${FALSE_NEG} (nicht erkannter Fehler)
True Negatives: ${TRUE_NEG} (korrekt als nicht-widerlegbar bewertet)

Kosten: \$${TOTAL_COST}
Zeit: $((TOTAL_TIME / 1000))s total

Reports: ${RESULTS_DIR}/
EOF

# Store the condensed summary next to the per-case reports
cat > "${RESULTS_DIR}/summary.txt" <<EOF
Testlauf: ${TIMESTAMP}
Modus: ${MODE}
Fälle: ${CASE_PASS} OK | ${CASE_FAIL} fehlgeschlagen | ${CASE_ERROR} Fehler
Claims: ${TOTAL_CLAIMS} geprüft
Precision: ${precision}
Recall: ${recall}
TP=${TRUE_POS} FP=${FALSE_POS} FN=${FALSE_NEG} TN=${TRUE_NEG}
Kosten: \$${TOTAL_COST}
Zeit: $((TOTAL_TIME / 1000))s
EOF

# Exit status: 0 only when no case failed and no case errored
if ((CASE_FAIL > 0 || CASE_ERROR > 0)); then
  exit 1
fi
exit 0