Add OCR fallback via Ollama Vision for albums without tracklist text

hint_extractor: _ocr_back_cover() sends the first back-cover/inlay/booklet image (or, failing that, the first image not named like a front cover) to Ollama Vision when no tracklist .txt/.htm/.nfo is present. Models are tried in priority order: qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (configurable via OLLAMA_OCR_MODEL env var). Timeout 180s per model request. OCR text is fed into the same _parse_tracklist() pipeline as regular text files. music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped with --no-api to allow fully offline/fast runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 03:08:21 +02:00 · 2026-04-29 03:08:21 +02:00 · 40a2ef3fb6
commit 40a2ef3fb6
parent 28f716f8f2
2 changed files with 91 additions and 2 deletions
--- a/hint_extractor.py
+++ b/hint_extractor.py
@ -1,7 +1,11 @@
 from __future__ import annotations

+import base64
+import json
+import os
 import re
 import sys
+import urllib.request
 from pathlib import Path
 from typing import Optional, List, Dict, Tuple

@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
    return None


+_OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")  # base URL for the /api/chat request; overridable via OLLAMA_HOST
+# Models in priority order (comma-separated); can be overridden via the OLLAMA_OCR_MODEL environment variable
+_OCR_MODELS = [m.strip() for m in os.getenv(
+    "OLLAMA_OCR_MODEL",
+    "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest"
+).split(",") if m.strip()]
+
+_OCR_PROMPT = (  # instruction sent with the image; the NO_TRACKLIST sentinel is checked by _ocr_back_cover()
+    "This image shows a CD album back cover or booklet page. "
+    "Your task: extract the complete tracklist as plain text.\n"
+    "Rules:\n"
+    "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'\n"
+    "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group\n"
+    "- Include durations if visible (e.g. '1. Title  4:32')\n"
+    "- Do NOT include label info, barcodes, or other non-tracklist text\n"
+    "- If no tracklist is visible, reply with: NO_TRACKLIST"
+)
+
+
+def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
+    """
+    Send one back-cover/booklet image to an Ollama vision model and return the text it extracts.
+    Returns None if no candidate image exists, it cannot be read, or no model yields a tracklist.
+    """
+    # Prefer images whose file name suggests a back cover, inlay or booklet page
+    candidates = [
+        p for p in image_files
+        if any(kw in p.name.lower() for kw in ("back", "inlay", "booklet", "inside", "rear"))
+    ]
+    # Fallback: every image whose name does not look like a front cover
+    if not candidates:
+        candidates = [
+            p for p in image_files
+            if not any(kw in p.name.lower() for kw in ("front", "folder", "cover"))
+        ]
+    if not candidates:
+        return None
+
+    image_path = candidates[0]  # only the first candidate is ever sent to a model
+    try:
+        img_b64 = base64.b64encode(image_path.read_bytes()).decode()
+    except Exception as e:
+        print(f"  ⚠️ OCR-Bild lesen {image_path.name}: {e}", file=sys.stderr)
+        return None
+
+    for model in _OCR_MODELS:  # try each configured model in order until one returns text
+        payload = json.dumps({
+            "model": model,
+            "messages": [{
+                "role": "user",
+                "content": _OCR_PROMPT,
+                "images": [img_b64],
+            }],
+            "stream": False,
+            "options": {"temperature": 0.0},
+        }).encode()
+        try:
+            req = urllib.request.Request(
+                f"{_OLLAMA_HOST}/api/chat",
+                data=payload,
+                headers={"Content-Type": "application/json"},
+                method="POST",
+            )
+            with urllib.request.urlopen(req, timeout=180) as resp:  # 180 s timeout per model attempt
+                data = json.loads(resp.read())
+            text = data.get("message", {}).get("content", "").strip()  # empty text matches neither branch below, so the next model is tried
+            if text and "NO_TRACKLIST" not in text:
+                print(f"  📷 OCR {image_path.name} via {model}: {len(text)} Zeichen extrahiert",
+                      file=sys.stderr)
+                return text
+            elif "NO_TRACKLIST" in text:  # sentinel from _OCR_PROMPT: give up without trying the remaining models
+                print(f"  📷 OCR {image_path.name}: kein Tracklist-Text erkannt", file=sys.stderr)
+                return None
+        except Exception as e:  # any failure in the request or response parsing is logged, then the next model is tried
+            print(f"  ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
+            continue
+    return None
+
+
 def _check_cover_images(paths: List[Path]) -> List[Path]:
    good: List[Path] = []
    for p in paths:
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
    return good


-def extract_hints(scan: AlbumScan) -> AlbumHints:
+def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
    hints = AlbumHints(album_dir=scan.album_dir)

    # Directory name
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
            texts.append(txt)
    hints.tracklist_text = "\n\n".join(texts) if texts else None

+    # OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
+    if use_ocr and not hints.tracklist_text and scan.image_files:
+        ocr_text = _ocr_back_cover(scan.image_files)
+        if ocr_text:
+            hints.tracklist_text = ocr_text
+
    parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []

    # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
--- a/music_enricher.py
+++ b/music_enricher.py
@ -90,7 +90,7 @@ def process_album(
            stats["skipped"] += 1
            return stats

-        hints = extract_hints(scan)
+        hints = extract_hints(scan, use_ocr=not args.no_api)

        proposal = resolve(
            hints,