Add Vision-LLM mode for direct image-to-JSON extraction

Tesseract OCR fails on rotated or low-contrast CD back covers. The new vision_llm module sends images directly to qwen3-vl via the Ollama chat API, bypassing OCR entirely. Robust JSON extraction handles thinking tags, Markdown code blocks, and empty responses. The CLI scan/process commands gain a --vision flag.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 01:35:05 +01:00 · 2026-02-15 01:35:05 +01:00 · 1753ab204f
commit 1753ab204f
parent 686c4317d1
5 changed files with 359 additions and 55 deletions
--- a/src/musiksammlung/vision_llm.py
+++ b/src/musiksammlung/vision_llm.py
@ -0,0 +1,150 @@
+"""Vision-LLM: Bild direkt an ein multimodales LLM senden, ohne OCR-Zwischenschritt."""
+
+from __future__ import annotations
+
+import base64
+import json
+import logging
+import re
+from pathlib import Path
+
+import httpx
+from pydantic import ValidationError
+
+from musiksammlung.models import Album
+
+logger = logging.getLogger(__name__)
+
+# Instruction prompt sent together with the image(s). The wording is German and
+# is runtime text, so it is kept verbatim. It ends with a one-line example of the
+# expected JSON shape and a trailing "/no_think" marker (presumably the Qwen
+# switch that suppresses reasoning output -- TODO confirm).
+VISION_PROMPT = "\n".join(
+    [
+        "Lies das Foto einer CD-Rückseite oder eines Booklets ab. "
+        "Das Bild kann gedreht sein.",
+        "Extrahiere daraus die Metadaten und die vollständige Trackliste.",
+        "",
+        "WICHTIG:",
+        '- "artist" ist der Hauptinterpret oder "Various Artists" '
+        "bei Samplern/Compilations.",
+        '- "album" ist der Albumtitel (z.B. "Deutsche Volkslieder", "Abbey Road").',
+        '- "year" ist das Erscheinungsjahr (Zahl oder null wenn nicht sichtbar).',
+        "- Lies die Tracktitel GENAU so ab, wie sie auf der CD stehen.",
+        "- Achte besonders auf korrekte deutsche Umlaute (ä, ö, ü, ß).",
+        '- Wenn "CD 1", "CD 2", "Disc 1" etc. sichtbar sind, '
+        'erstelle mehrere Einträge in "discs".',
+        "- Ohne Disc-Angabe: eine Disc mit disc_number=1.",
+        '- Lasse Zeitangaben (z.B. "3:12") und Interpretenangaben pro Track weg.',
+        "",
+        "Antworte NUR mit dem JSON, ohne Erklärung. Beispiel:",
+        "",
+        # Example object, split only for line length; it is a single prompt line.
+        '{"artist":"Various Artists","album":"Deutsche Volkslieder","year":null,'
+        '"discs":[{"disc_number":1,"name":null,"tracks":['
+        '{"track_number":1,"title":"Erster Song"},'
+        '{"track_number":2,"title":"Zweiter Song"}]}]}',
+        "",
+        "Jetzt lies das Bild ab und gib das vollständige JSON aus. /no_think",
+    ]
+)
+
+
+def _encode_image(image_path: Path) -> str:
+    """Read the file at *image_path* and return its bytes as a Base64 string."""
+    raw_bytes = image_path.read_bytes()
+    encoded = base64.b64encode(raw_bytes)
+    return str(encoded, "utf-8")
+
+
+def _extract_json(text: str) -> str:
+    """Extract the JSON object embedded in an LLM response.
+
+    Handled, in this order:
+
+    - ``<think>...</think>`` sections, which are removed first
+    - JSON inside a Markdown code fence (```json ... ```)
+    - plain JSON, optionally surrounded by other text
+
+    Args:
+        text: Raw response text from the model.
+
+    Returns:
+        The substring holding the JSON object. The string is not guaranteed
+        to be valid JSON; the caller is expected to parse it.
+
+    Raises:
+        ValueError: If the response is empty or blank, or if it contains no
+            ``{...}`` span at all.
+    """
+    if not text or not text.strip():
+        raise ValueError("Leere Antwort vom Vision-LLM")
+
+    # Drop thinking sections before looking for JSON.
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
+
+    # Prefer the content of a Markdown code fence when there is one.
+    md_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
+    if md_match:
+        return md_match.group(1)
+
+    # Widest span from the first "{" to the last "}".
+    brace_match = re.search(r"\{.*\}", text, re.DOTALL)
+    if not brace_match:
+        raise ValueError(f"Kein JSON in Antwort gefunden: {text[:200]}")
+
+    # The widest span swallows trailing text that happens to contain "}".
+    # Let the JSON decoder find where the first object really ends instead.
+    start = brace_match.start()
+    try:
+        _, end = json.JSONDecoder().raw_decode(text, start)
+    except json.JSONDecodeError:
+        # Not parseable from here: hand back the widest span unchanged so the
+        # caller reports the JSON error for it.
+        return brace_match.group(0)
+    return text[start:end]
+
+
+def parse_image(
+    image_paths: list[Path],
+    model: str = "qwen3-vl:latest",
+    base_url: str = "http://localhost:11434",
+    max_retries: int = 3,
+) -> Album:
+    """Send images straight to a vision LLM and extract the album data.
+
+    The same request is repeated when the reply cannot be turned into a
+    valid ``Album`` (no JSON found, invalid JSON, schema validation failure,
+    or an unexpected response shape).
+
+    Args:
+        image_paths: Image files to send (back cover, booklet, ...). Must not
+            be empty.
+        model: Name of the Ollama vision model.
+        base_url: Base URL of the Ollama API; a trailing slash is ignored.
+        max_retries: Number of additional attempts after the first one.
+
+    Returns:
+        The validated ``Album`` object.
+
+    Raises:
+        ValueError: If ``image_paths`` is empty, or if no attempt produced a
+            valid result (the last failure is attached as ``__cause__``).
+        httpx.HTTPStatusError: If the API answers with an error status. This
+            is logged and re-raised immediately, without retrying.
+
+    Other exceptions raised by ``httpx.post`` or by reading the image files
+    are not caught here.
+    """
+    if not image_paths:
+        # Without an image the model can only echo the example from the
+        # prompt, which would then be returned as if it were a real result.
+        raise ValueError("Keine Bilddateien für das Vision-LLM übergeben")
+
+    images_b64 = [_encode_image(p) for p in image_paths]
+
+    # The request is identical for every attempt, so build it once.
+    chat_url = f"{base_url.rstrip('/')}/api/chat"
+    payload = {
+        "model": model,
+        "messages": [
+            {
+                "role": "user",
+                "content": VISION_PROMPT,
+                "images": images_b64,
+            }
+        ],
+        "stream": False,
+    }
+
+    last_error: Exception | None = None
+
+    for attempt in range(max_retries + 1):
+        try:
+            response = httpx.post(chat_url, json=payload, timeout=300.0)
+            response.raise_for_status()
+
+            # A reply without message.content is treated like any other
+            # unusable answer (retry) instead of crashing with KeyError.
+            try:
+                raw_text = response.json()["message"]["content"]
+            except (KeyError, TypeError) as e:
+                raise ValueError(
+                    f"Unerwartetes Antwortformat vom Vision-LLM: {e!r}"
+                ) from e
+            if not isinstance(raw_text, str):
+                raise ValueError(
+                    "Unerwartetes Antwortformat vom Vision-LLM: "
+                    f"content ist {type(raw_text).__name__}"
+                )
+
+            logger.info(
+                "Vision-LLM Antwort (Versuch %d, %d Zeichen)",
+                attempt + 1, len(raw_text),
+            )
+            logger.debug("Rohantwort: %s", raw_text[:1000])
+
+            json_str = _extract_json(raw_text)
+            data = json.loads(json_str)
+            album = Album.model_validate(data)
+
+            logger.info(
+                "Vision-LLM erfolgreich: %s - %s (%d Discs, %d Tracks)",
+                album.artist,
+                album.album,
+                len(album.discs),
+                sum(len(d.tracks) for d in album.discs),
+            )
+            return album
+
+        except (json.JSONDecodeError, ValidationError, ValueError) as e:
+            # Unusable answer: remember the reason and try again.
+            last_error = e
+            logger.warning(
+                "Versuch %d/%d fehlgeschlagen: %s",
+                attempt + 1, max_retries + 1, e,
+            )
+
+        except httpx.HTTPStatusError as e:
+            # An error status is not retried here.
+            logger.error("HTTP-Fehler vom Vision-LLM: %s", e)
+            raise
+
+    msg = f"Vision-LLM lieferte nach {max_retries + 1} Versuchen kein valides Ergebnis"
+    raise ValueError(msg) from last_error