#!/usr/bin/env bash
# tests/run_corpus.sh
# Runs every test-corpus case through verify-article and computes
# precision/recall for the "contradicted" verdict.
#
# Usage:
#   cd ~/Pi_Agent_Projekts/text_agent
#   bash tests/run_corpus.sh                     # all cases
#   bash tests/run_corpus.sh case_001 case_002   # selected cases only
#   bash tests/run_corpus.sh --mode deep         # Perplexity mode
#   bash tests/run_corpus.sh --no-cache          # bypass the cache
#
# Output:
#   tests/results/<timestamp>/                   <- JSON report per case
#   tests/results/<timestamp>/<case>.stderr.log  <- agent stderr (if any)
#   tests/results/<timestamp>/errors.log         <- failed cases (if any)
#   tests/results/<timestamp>/summary.txt        <- precision/recall summary
#
# Exit code: 0 if every processed case passed, 1 otherwise.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CORPUS_DIR="${SCRIPT_DIR}/corpus"
AGENT="${SCRIPT_DIR}/../agenten/llama-verify-article.ts"
TIMESTAMP="$(date +%Y-%m-%d_%H-%M-%S)"
RESULTS_DIR="${SCRIPT_DIR}/results/${TIMESTAMP}"

# ---------------------------------------------------------------------------
# Argument parsing
# ---------------------------------------------------------------------------
MODE="fast"
EXTRA_FLAGS=()
SELECTED_CASES=()

# A while/shift loop is required here: `shift` inside `for arg in "$@"` does
# not advance the iteration, so "--mode <value>" would pick up the wrong word
# whenever it is not the first argument.
while (( $# > 0 )); do
  case "$1" in
    --mode)
      if (( $# < 2 )); then
        echo "Fehler: --mode erwartet einen Wert." >&2
        exit 2
      fi
      MODE="$2"
      shift
      ;;
    --mode=*)
      MODE="${1#--mode=}"
      ;;
    --no-cache)
      EXTRA_FLAGS+=(--no-cache)
      ;;
    case_*)
      SELECTED_CASES+=("$1")
      ;;
    *)
      # Unknown arguments are ignored.
      ;;
  esac
  shift
done

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
green()  { printf '\033[0;32m%s\033[0m\n' "$*"; }
red()    { printf '\033[0;31m%s\033[0m\n' "$*"; }
yellow() { printf '\033[0;33m%s\033[0m\n' "$*"; }
bold()   { printf '\033[1m%s\033[0m\n' "$*"; }

#######################################
# Prints the current time in milliseconds since the epoch.
# `date +%s%3N` is a GNU extension; if its output is not purely numeric
# the value is obtained from python3 instead.
# Outputs: integer milliseconds on stdout
#######################################
now_ms() {
  local ts
  ts="$(date +%s%3N 2>/dev/null || true)"
  if [[ "${ts}" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "${ts}"
  else
    python3 -c 'import time; print(int(time.time() * 1000))'
  fi
}

#######################################
# Prints hits / (hits + misses) as a percentage with one decimal.
# Arguments: $1 - hits, $2 - misses (integers, sum must be > 0)
# Outputs:   e.g. "66.7%" on stdout, or "n/a" if the computation fails
#######################################
percent_of() {
  python3 -c 'import sys
hits, misses = int(sys.argv[1]), int(sys.argv[2])
print(f"{hits / (hits + misses) * 100:.1f}%")' "$1" "$2" 2>/dev/null \
    || echo "n/a"
}

#######################################
# Compares the expected claims of one case with the agent report.
# All data is passed through argv, never spliced into the Python source,
# so quotes or backslashes in claim texts cannot break the evaluation.
# Arguments: $1 - path to expected.json, $2 - path to the agent's JSON report
# Outputs:   one record per usable claim on stdout, fields separated by the
#            ASCII unit separator (0x1f):
#              expected_status, actual_status, text_contains
#            actual_status is the status of the first report result whose
#            claim_text contains text_contains (case-insensitive),
#            "not_found" if none matches, "error" if the report is unusable.
#            Claims lacking text_contains or expected_status are omitted.
# Returns:   non-zero if expected.json cannot be read or parsed
#######################################
evaluate_claims() {
  PYTHONIOENCODING=utf-8 python3 - "$1" "$2" <<'PY'
import json
import sys

SEP = "\x1f"


def clean(value):
    # Keep every record on a single line and free of the field separator.
    return str(value).replace(SEP, " ").replace("\r", " ").replace("\n", " ")


def load_report(path):
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except Exception:
        return None


def lookup_status(report, needle):
    # Any problem with the report's structure is reported as "error".
    try:
        needle = needle.lower()
        for result in report.get("results", []):
            if needle in result.get("claim_text", "").lower():
                return result.get("status", "not_found")
        return "not_found"
    except Exception:
        return "error"


expected_path, report_path = sys.argv[1], sys.argv[2]
with open(expected_path, encoding="utf-8") as fh:
    expected = json.load(fh)
report = load_report(report_path)

for claim in expected.get("claims", []):
    if not isinstance(claim, dict):
        continue
    text = str(claim.get("text_contains", ""))
    status = str(claim.get("expected_status", ""))
    if not text or not status:
        continue
    print(SEP.join((clean(status), clean(lookup_status(report, text)), clean(text))))
PY
}

# Without python3 every evaluation step would silently produce nothing and
# each case would be reported as passed, so fail early instead.
if ! command -v python3 >/dev/null 2>&1; then
  echo "Fehler: python3 wird benötigt, wurde aber nicht gefunden." >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------
mkdir -p "${RESULTS_DIR}"

# ---------------------------------------------------------------------------
# Determine cases
# ---------------------------------------------------------------------------
CASES=()
if (( ${#SELECTED_CASES[@]} == 0 )); then
  # Glob instead of parsing `ls`; nullglob makes "no match" an empty list.
  shopt -s nullglob
  for case_path in "${CORPUS_DIR}"/case_*; do
    CASES+=("${case_path##*/}")
  done
  shopt -u nullglob
else
  CASES=("${SELECTED_CASES[@]}")
fi

if (( ${#CASES[@]} == 0 )); then
  echo "Keine Fälle in ${CORPUS_DIR} gefunden."
  exit 1
fi

bold "Pi Text-Agent — Testkorpus-Auswertung"
echo "Modus: ${MODE} | Fälle: ${#CASES[@]} | Ergebnisse: ${RESULTS_DIR}"
echo ""

# ---------------------------------------------------------------------------
# Metrics (global counters)
# ---------------------------------------------------------------------------
TOTAL_CLAIMS=0
TRUE_POS=0    # expected contradicted     -> actually contradicted
FALSE_POS=0   # expected NOT contradicted -> actually contradicted
FALSE_NEG=0   # expected contradicted     -> actually NOT contradicted
TRUE_NEG=0    # expected status other than contradicted, matched exactly
CASE_PASS=0
CASE_FAIL=0
CASE_ERROR=0
TOTAL_COST=0
TOTAL_TIME=0

# ---------------------------------------------------------------------------
# Per-case processing
# ---------------------------------------------------------------------------
for case_name in "${CASES[@]}"; do
  case_dir="${CORPUS_DIR}/${case_name}"
  input_file="${case_dir}/input.txt"
  expected_file="${case_dir}/expected.json"

  if [[ ! -f "${input_file}" ]]; then
    yellow " ${case_name}: input.txt nicht gefunden — übersprungen"
    continue
  fi
  if [[ ! -f "${expected_file}" ]]; then
    yellow " ${case_name}: expected.json nicht gefunden — übersprungen"
    continue
  fi

  printf ' %s: ' "${case_name}"

  result_file="${RESULTS_DIR}/${case_name}.json"
  stderr_file="${RESULTS_DIR}/${case_name}.stderr.log"
  t_start="$(now_ms)"

  # Call verify-article (--job-id caches claims.json between runs).
  # The agent's stderr is kept next to the report so failures can be
  # diagnosed; the `${arr[@]+...}` form keeps an empty flag array safe under
  # `set -u` on older bash versions.
  agent_ok=true
  npx tsx "${AGENT}" \
    --mode "${MODE}" \
    --job-id "corpus-${case_name}" \
    --json \
    ${EXTRA_FLAGS[@]+"${EXTRA_FLAGS[@]}"} \
    "$(<"${input_file}")" \
    > "${result_file}" 2> "${stderr_file}" || agent_ok=false

  elapsed_ms=$(( $(now_ms) - t_start ))
  [[ -s "${stderr_file}" ]] || rm -f -- "${stderr_file}"

  if [[ "${agent_ok}" != true ]]; then
    red "FEHLER (${elapsed_ms}ms)"
    CASE_ERROR=$((CASE_ERROR + 1))
    echo "${case_name}: Fehlerhafter Exit-Code von verify-article" \
      >> "${RESULTS_DIR}/errors.log"
    continue
  fi

  # Cost from the report; anything unreadable or non-numeric counts as 0.
  cost="$(python3 -c 'import json, sys
try:
    with open(sys.argv[1], encoding="utf-8") as fh:
        value = json.load(fh).get("totalCostUSD", 0)
    if isinstance(value, bool):
        raise ValueError(value)
    float(value)
    print(value)
except Exception:
    print(0)' "${result_file}" 2>/dev/null || echo "0")"

  # json.loads keeps integers as integers, so the total prints as before.
  TOTAL_COST="$(python3 -c 'import json, sys
print(json.loads(sys.argv[1]) + json.loads(sys.argv[2]))' \
    "${TOTAL_COST}" "${cost}" 2>/dev/null || echo "${TOTAL_COST}")"
  TOTAL_TIME=$((TOTAL_TIME + elapsed_ms))

  # Check expectations. An unreadable expected.json is a case error rather
  # than a silent pass with zero evaluated claims.
  if ! claim_records="$(evaluate_claims "${expected_file}" "${result_file}" \
      2>> "${RESULTS_DIR}/errors.log")"; then
    red "FEHLER (expected.json nicht auswertbar)"
    CASE_ERROR=$((CASE_ERROR + 1))
    echo "${case_name}: expected.json konnte nicht ausgewertet werden" \
      >> "${RESULTS_DIR}/errors.log"
    continue
  fi

  case_pass=true
  claim_results=""
  while IFS=$'\x1f' read -r expected_status actual_status text_contains; do
    # An empty record list still yields one blank line from the here-string.
    [[ -n "${expected_status}" ]] || continue

    TOTAL_CLAIMS=$((TOTAL_CLAIMS + 1))
    snippet="${text_contains:0:50}"

    if [[ "${expected_status}" == "${actual_status}" ]]; then
      # Exact match.
      if [[ "${expected_status}" == "contradicted" ]]; then
        TRUE_POS=$((TRUE_POS + 1))
      else
        TRUE_NEG=$((TRUE_NEG + 1))
      fi
      claim_results+=$'\n'" ✓ [${actual_status}] ${snippet}"
    else
      # Any mismatch fails the case.
      case_pass=false
      if [[ "${expected_status}" == "contradicted" ]]; then
        FALSE_NEG=$((FALSE_NEG + 1))
        claim_results+=$'\n'" ✗ Erwartet contradicted, bekam ${actual_status}: ${snippet}"
      elif [[ "${actual_status}" == "contradicted" ]]; then
        FALSE_POS=$((FALSE_POS + 1))
        claim_results+=$'\n'" ✗ Falsch widersprüchlich: ${snippet}"
      else
        # e.g. supported vs mixed: fails the case but does not touch the
        # contradicted-centric TP/FP/FN/TN counters.
        claim_results+=$'\n'" ~ Erwartet ${expected_status}, bekam ${actual_status}: ${snippet}"
      fi
    fi
  done <<< "${claim_records}"

  if [[ "${case_pass}" == true ]]; then
    green "OK (${elapsed_ms}ms, \$${cost})"
    CASE_PASS=$((CASE_PASS + 1))
  else
    red "FEHLGESCHLAGEN (${elapsed_ms}ms, \$${cost})"
    CASE_FAIL=$((CASE_FAIL + 1))
  fi

  if [[ -n "${claim_results}" ]]; then
    # printf '%s' prints claim text verbatim (no backslash interpretation).
    printf '%s\n' "${claim_results}"
  fi
done

# ---------------------------------------------------------------------------
# Summary
# ---------------------------------------------------------------------------
echo ""
bold "=============================="
bold "Ergebnisse"
bold "=============================="
echo ""
echo "Fälle: ${CASE_PASS} OK | ${CASE_FAIL} fehlgeschlagen | ${CASE_ERROR} Fehler"
echo "Claims: ${TOTAL_CLAIMS} geprüft"
echo ""

# Precision: share of "contradicted" verdicts that were expected.
if (( TRUE_POS + FALSE_POS > 0 )); then
  precision="$(percent_of "${TRUE_POS}" "${FALSE_POS}")"
else
  precision="n/a (keine contradicted-Urteile)"
fi

# Recall: share of expected "contradicted" claims that were detected.
if (( TRUE_POS + FALSE_NEG > 0 )); then
  recall="$(percent_of "${TRUE_POS}" "${FALSE_NEG}")"
else
  recall="n/a (keine erwarteten contradicted-Claims)"
fi

echo "Precision: ${precision} (Anteil korrekt widerlegter unter allen Widerlegungen)"
echo "Recall: ${recall} (Anteil erkannter Fehler unter allen bekannten Fehlern)"
echo ""
echo "True Positives: ${TRUE_POS} (korrekter contradicted-Fund)"
echo "False Positives: ${FALSE_POS} (fälschlich widerlegter korrekter Fakt)"
echo "False Negatives: ${FALSE_NEG} (nicht erkannter Fehler)"
echo "True Negatives: ${TRUE_NEG} (korrekt als nicht-widerlegbar bewertet)"
echo ""
echo "Kosten: \$${TOTAL_COST}"
echo "Zeit: $((TOTAL_TIME / 1000))s total"
echo ""
echo "Reports: ${RESULTS_DIR}/"

# Persist the summary next to the reports.
{
  echo "Testlauf: ${TIMESTAMP}"
  echo "Modus: ${MODE}"
  echo "Fälle: ${CASE_PASS} OK | ${CASE_FAIL} fehlgeschlagen | ${CASE_ERROR} Fehler"
  echo "Claims: ${TOTAL_CLAIMS} geprüft"
  echo "Precision: ${precision}"
  echo "Recall: ${recall}"
  echo "TP=${TRUE_POS} FP=${FALSE_POS} FN=${FALSE_NEG} TN=${TRUE_NEG}"
  echo "Kosten: \$${TOTAL_COST}"
  echo "Zeit: $((TOTAL_TIME / 1000))s"
} > "${RESULTS_DIR}/summary.txt"

# Exit code: 0 only if no case failed or errored.
if (( CASE_FAIL > 0 || CASE_ERROR > 0 )); then
  exit 1
fi
exit 0