From 8ecade5cdc2e260b1a6d38dbdbfa2922778e14b2 Mon Sep 17 00:00:00 2001
From: dschlueter <dschlueter@kitux.de>
Date: Sun, 15 Feb 2026 02:26:58 +0100
Subject: [PATCH] Add --from-text mode and improve LLM parser robustness

- Add --from-text/-t option to scan and process commands for
  pre-formatted tracklists (e.g. from Perplexity)
- Switch llm_parser's Ollama call from /api/generate to /api/chat
- Reuse _extract_json() from vision_llm for robust JSON extraction
- Improve SYSTEM_PROMPT with strict rules (Various Artists, no
  invented years, no composer info in titles, /no_think)
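- Remove format:"json" constraint that caused empty responses (and the
  matching response_format json_object on the OpenAI-compatible call)
- Raise default max_retries from 2 to 3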

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/musiksammlung/cli.py        |  67 ++++++++++++++------
 src/musiksammlung/llm_parser.py | 108 +++++++++++++++++---------------
 2 files changed, 106 insertions(+), 69 deletions(-)

diff --git a/src/musiksammlung/cli.py b/src/musiksammlung/cli.py
index 5485bb3..4f886e3 100644
--- a/src/musiksammlung/cli.py
+++ b/src/musiksammlung/cli.py
@@ -28,8 +28,9 @@ logging.basicConfig(
 )
 
 
-def _scan_images(
-    images: list[Path],
+def _scan_to_album(
+    images: list[Path] | None,
+    from_text: Path | None,
     vision: bool,
     vision_model: str,
     languages: str,
@@ -37,8 +38,14 @@ def _scan_images(
     model: str,
     base_url: str,
 ) -> Album:
-    """Gemeinsame Scan-Logik für scan und process."""
-    if vision:
+    """Gemeinsame Scan-Logik: Text-Datei, Vision-LLM oder OCR+LLM."""
+    if from_text:
+        text = from_text.read_text(encoding="utf-8")
+        typer.echo(f"Text-Datei geladen ({len(text)} Zeichen). LLM-Parsing...")
+        return parse_tracklist(
+            text, backend=backend, model=model, base_url=base_url
+        )
+    elif vision:
         typer.echo(f"Vision-LLM ({vision_model})...")
         return parse_image(images, model=vision_model, base_url=base_url)
     else:
@@ -63,10 +70,16 @@ def _print_album_summary(album: Album) -> None:
 
 @app.command()
 def scan(
-    images: list[Path] = typer.Argument(..., help="Bilder der CD-Rückseite/Booklet"),
+    images: list[Path] = typer.Argument(
+        None, help="Bilder der CD-Rückseite/Booklet"
+    ),
     output: Path = typer.Option(
         "album.json", "--output", "-o", help="Ausgabe-JSON-Datei"
     ),
+    from_text: Path = typer.Option(
+        None, "--from-text", "-t",
+        help="Text/Markdown-Datei mit Trackliste (z.B. von Perplexity)",
+    ),
     vision: bool = typer.Option(
         False, "--vision", "-v", help="Vision-LLM statt OCR+Text-LLM"
     ),
@@ -80,18 +93,31 @@ def scan(
         "http://localhost:11434", "--url", help="LLM-API-URL"
     ),
 ) -> None:
-    """Bilder → Album-JSON erzeugen (zur Prüfung vor dem Anwenden).
+    """Bilder oder Text → Album-JSON erzeugen (zur Prüfung vor dem Anwenden).
 
-    Mit --vision wird ein Vision-LLM (z.B. qwen3-vl) direkt auf die Bilder
-    angewendet. Ohne --vision wird Tesseract-OCR + Text-LLM verwendet.
+    Drei Modi:
+      --from-text  Textdatei (z.B. von Perplexity) → LLM → JSON
+      --vision     Bild → Vision-LLM → JSON
+      (Standard)   Bild → Tesseract-OCR → Text-LLM → JSON
     """
-    for img in images:
-        if not img.exists():
-            typer.echo(f"Fehler: Bild nicht gefunden: {img}", err=True)
+    if from_text:
+        if not from_text.exists():
+            typer.echo(f"Fehler: Datei nicht gefunden: {from_text}", err=True)
             raise typer.Exit(1)
+    elif not images:
+        typer.echo(
+            "Fehler: Bilder oder --from-text angeben.", err=True
+        )
+        raise typer.Exit(1)
+    else:
+        for img in images:
+            if not img.exists():
+                typer.echo(f"Fehler: Bild nicht gefunden: {img}", err=True)
+                raise typer.Exit(1)
 
-    album = _scan_images(
-        images, vision, vision_model, languages, backend, model, base_url
+    album = _scan_to_album(
+        images, from_text, vision, vision_model,
+        languages, backend, model, base_url,
     )
 
     output.write_text(album.model_dump_json(indent=2), encoding="utf-8")
@@ -156,6 +182,10 @@ def process(
     images: list[Path] | None = typer.Option(
         None, "--image", "-i", help="Zusätzliche Bilder für Scan"
     ),
+    from_text: Path = typer.Option(
+        None, "--from-text", "-t",
+        help="Text/Markdown-Datei mit Trackliste (z.B. von Perplexity)",
+    ),
     vision: bool = typer.Option(
         False, "--vision", "-v", help="Vision-LLM statt OCR+Text-LLM"
     ),
@@ -175,16 +205,17 @@ def process(
     if images:
         scan_sources.extend(images)
 
-    if not scan_sources:
+    if not from_text and not scan_sources:
         typer.echo(
-            "Fehler: Mindestens ein Bild nötig (--back oder --image)", err=True
+            "Fehler: --from-text, --back oder --image angeben.", err=True
         )
         raise typer.Exit(1)
 
-    # 1. Scan (Vision oder OCR+LLM)
+    # 1. Scan (Text-Datei, Vision oder OCR+LLM)
     typer.echo("Schritt 1/4: Bilderkennung...")
-    album = _scan_images(
-        scan_sources, vision, vision_model, languages, backend, model, base_url
+    album = _scan_to_album(
+        scan_sources or None, from_text, vision, vision_model,
+        languages, backend, model, base_url,
     )
     _print_album_summary(album)
 
diff --git a/src/musiksammlung/llm_parser.py b/src/musiksammlung/llm_parser.py
index 295c7c7..1b0a98e 100644
--- a/src/musiksammlung/llm_parser.py
+++ b/src/musiksammlung/llm_parser.py
@@ -1,4 +1,4 @@
-"""LLM-basiertes Parsing von OCR-Text zu strukturierten Album-Daten."""
+"""LLM-basiertes Parsing von Text zu strukturierten Album-Daten."""
 
 from __future__ import annotations
 
@@ -9,60 +9,52 @@ import httpx
 from pydantic import ValidationError
 
 from musiksammlung.models import Album
+from musiksammlung.vision_llm import _extract_json
 
 logger = logging.getLogger(__name__)
 
 SYSTEM_PROMPT = """\
-Du bist ein Parser für CD-Rückseiten und Tracklisten.
-Analysiere den OCR-Text und extrahiere: Artist, Albumtitel, Jahr (falls vorhanden) \
-und für jede CD die Tracks in korrekter Reihenfolge.
-Ignoriere Werbung, Copyright-Hinweise und Kleingedrucktes.
+Du bist ein Parser für CD-Tracklisten. Extrahiere die Metadaten als JSON.
 
-Regeln:
-- Wenn es Hinweise wie "CD 1", "CD 2", "Disc 1", "Disc 2" gibt, ordne die Tracks \
-  der entsprechenden disc_number zu.
-- Ohne Disc-Angabe: alles als disc_number=1 behandeln.
-- Zusätze wie "live", "bonus track", "remastered" gehören in den Tracktitel.
-- Bei Unsicherheit: Feld weglassen oder null setzen, nichts erfinden.
+REGELN:
+- "artist": Wenn verschiedene Interpreten pro Track → "Various Artists". \
+NUR wenn alle Tracks denselben Interpreten haben, nimm diesen als artist.
+- "album": Der Albumtitel (z.B. "Deutsche Volkslieder").
+- "year": NUR wenn ein Jahr explizit im Text steht. Sonst null. NICHTS ERFINDEN.
+- "title": NUR der Songtitel. KEINE Komponisten, KEINE Interpreten, KEINE Zeitangaben.
+  Beispiel: "Wer recht in Freuden wandern will" — NICHT \
+"Wer recht in Freuden wandern will (Klauer – Geibel)"
+- Jede Tracknummer darf nur EINMAL vorkommen. Keine Duplikate.
+- "CD 1", "CD 2", "Disc 1" etc. → eigene disc_number. Sonst disc_number=1.
 
-Gib ausschließlich valides JSON zurück, kein anderer Text. Format:
-{
-  "artist": "...",
-  "album": "...",
-  "year": 1987,
-  "discs": [
-    {
-      "disc_number": 1,
-      "name": null,
-      "tracks": [
-        {"track_number": 1, "title": "..."},
-        {"track_number": 2, "title": "..."}
-      ]
-    }
-  ]
-}
-"""
+Gib ausschließlich valides JSON zurück:
+{"artist":"Various Artists","album":"Albumname","year":null,\
+"discs":[{"disc_number":1,"name":null,\
+"tracks":[{"track_number":1,"title":"Nur der Songtitel"}]}]}
+
+/no_think"""
 
 
-def _call_ollama(ocr_text: str, model: str, base_url: str) -> str:
-    """Ruft Ollama-API auf und gibt die Antwort als String zurück."""
+def _call_ollama(text: str, model: str, base_url: str) -> str:
+    """Ruft Ollama Chat-API auf und gibt die Antwort als String zurück."""
     response = httpx.post(
-        f"{base_url}/api/generate",
+        f"{base_url}/api/chat",
         json={
             "model": model,
-            "system": SYSTEM_PROMPT,
-            "prompt": ocr_text,
+            "messages": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": text},
+            ],
             "stream": False,
-            "format": "json",
         },
         timeout=120.0,
     )
     response.raise_for_status()
-    return response.json()["response"]
+    return response.json()["message"]["content"]
 
 
 def _call_openai_compatible(
-    ocr_text: str, model: str, base_url: str, api_key: str | None = None
+    text: str, model: str, base_url: str, api_key: str | None = None
 ) -> str:
     """Ruft eine OpenAI-kompatible API auf (OpenAI, Anthropic via Proxy, etc.)."""
     headers = {}
@@ -76,9 +68,8 @@ def _call_openai_compatible(
             "model": model,
             "messages": [
                 {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": ocr_text},
+                {"role": "user", "content": text},
             ],
-            "response_format": {"type": "json_object"},
         },
         timeout=120.0,
     )
@@ -87,17 +78,17 @@ def _call_openai_compatible(
 
 
 def parse_tracklist(
-    ocr_text: str,
+    text: str,
     backend: str = "ollama",
     model: str = "llama3",
     base_url: str = "http://localhost:11434",
     api_key: str | None = None,
-    max_retries: int = 2,
+    max_retries: int = 3,
 ) -> Album:
-    """Parst OCR-Text via LLM zu einem Album-Modell.
+    """Parst Text (OCR oder Klartext) via LLM zu einem Album-Modell.
 
     Args:
-        ocr_text: Rohtext aus der OCR-Erkennung
+        text: Eingabetext (OCR-Rohtext oder saubere Trackliste)
         backend: 'ollama' oder 'openai'
         model: Modellname
         base_url: API-Basis-URL
@@ -107,20 +98,35 @@ def parse_tracklist(
     Returns:
         Validiertes Album-Objekt
     """
+    last_error: Exception | None = None
+
     for attempt in range(max_retries + 1):
         try:
             if backend == "ollama":
-                raw = _call_ollama(ocr_text, model, base_url)
+                raw = _call_ollama(text, model, base_url)
             else:
-                raw = _call_openai_compatible(ocr_text, model, base_url, api_key)
+                raw = _call_openai_compatible(text, model, base_url, api_key)
 
-            data = json.loads(raw)
+            logger.info(
+                "LLM Antwort (Versuch %d, %d Zeichen)",
+                attempt + 1, len(raw),
+            )
+            logger.debug("Rohantwort: %s", raw[:1000])
+
+            json_str = _extract_json(raw)
+            data = json.loads(json_str)
             album = Album.model_validate(data)
-            logger.info("LLM-Parsing erfolgreich: %s - %s", album.artist, album.album)
+            logger.info(
+                "LLM-Parsing erfolgreich: %s - %s", album.artist, album.album
+            )
             return album
 
-        except (json.JSONDecodeError, ValidationError) as e:
-            logger.warning("Versuch %d/%d fehlgeschlagen: %s", attempt + 1, max_retries + 1, e)
-            if attempt == max_retries:
-                msg = f"LLM lieferte nach {max_retries + 1} Versuchen kein valides JSON"
-                raise ValueError(msg) from e
+        except (json.JSONDecodeError, ValidationError, ValueError) as e:
+            last_error = e
+            logger.warning(
+                "Versuch %d/%d fehlgeschlagen: %s",
+                attempt + 1, max_retries + 1, e,
+            )
+
+    msg = f"LLM lieferte nach {max_retries + 1} Versuchen kein valides JSON"
+    raise ValueError(msg) from last_error