Add OCR fallback via Ollama Vision for albums without tracklist text

hint_extractor: _ocr_back_cover() sends the first back/inlay/booklet image it finds to Ollama Vision when no tracklist .txt/.htm/.nfo is present. Models are tried in priority order: qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (overridable via the comma-separated OLLAMA_OCR_MODEL env var). Timeout is 180s per request. OCR text is fed into the same _parse_tracklist() pipeline as regular text files. music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped with --no-api to allow fully offline/fast runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 03:08:21 +02:00 · 2026-04-29 03:08:21 +02:00 · 40a2ef3fb6
commit 40a2ef3fb6
parent 28f716f8f2
2 changed files with 91 additions and 2 deletions
--- a/hint_extractor.py
+++ b/hint_extractor.py
@ -1,7 +1,11 @@
 from __future__ import annotations
 import base64
 import json
 import os
 import re
 import sys
 import urllib.request
 from pathlib import Path
 from typing import Optional, List, Dict, Tuple
@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
    return None
 def _normalize_ollama_host(raw: Optional[str]) -> str:
     """
     Turn an OLLAMA_HOST-style value into a base URL usable with urllib.

     OLLAMA_HOST is commonly set without a scheme (e.g. "0.0.0.0:11434").
     urllib rejects such a value with "unknown url type", which would make
     every OCR request fail. This helper therefore:
     - falls back to "http://localhost:11434" for None / empty / blank input,
     - prepends "http://" when no scheme is present,
     - strips trailing slashes so that appending "/api/chat" never yields "//".
     """
     host = (raw or "").strip().rstrip("/")
     if not host:
         return "http://localhost:11434"
     if "://" not in host:
         host = f"http://{host}"
     return host


 # Base URL of the Ollama server (without trailing slash), taken from OLLAMA_HOST.
 _OLLAMA_HOST = _normalize_ollama_host(os.getenv("OLLAMA_HOST"))

 # Models in priority order; can be overridden via OLLAMA_OCR_MODEL
 # (comma-separated list, blank entries are ignored).
 _OCR_MODELS = [m.strip() for m in os.getenv(
     "OLLAMA_OCR_MODEL",
     "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest"
 ).split(",") if m.strip()]

 # Prompt sent together with the image. The literal reply "NO_TRACKLIST" is the
 # sentinel the caller checks for to detect "nothing to extract".
 _OCR_PROMPT = (
     "This image shows a CD album back cover or booklet page. "
     "Your task: extract the complete tracklist as plain text.\n"
     "Rules:\n"
     "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'\n"
     "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group\n"
     "- Include durations if visible (e.g. '1. Title  4:32')\n"
     "- Do NOT include label info, barcodes, or other non-tracklist text\n"
     "- If no tracklist is visible, reply with: NO_TRACKLIST"
 )
 def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
     """
     Run OCR on a back-cover or booklet image via the Ollama chat endpoint.

     Image selection: files whose name contains one of "back", "inlay",
     "booklet", "inside" or "rear" are preferred; if there are none, every
     file whose name does not contain "front", "folder" or "cover" is
     considered. Only the first candidate is sent.

     The models in _OCR_MODELS are tried in order. A request error moves on
     to the next model, as does an empty reply.

     Returns the recognised text, or None when there is no candidate image,
     the image cannot be read, a model answers with NO_TRACKLIST, or no model
     produced any text.
     """
     back_keywords = ("back", "inlay", "booklet", "inside", "rear")
     front_keywords = ("front", "folder", "cover")

     def _name_contains(path: Path, keywords: Tuple[str, ...]) -> bool:
         lowered = path.name.lower()
         return any(kw in lowered for kw in keywords)

     # Prefer images that look like a back cover ...
     selected = [p for p in image_files if _name_contains(p, back_keywords)]
     # ... otherwise fall back to anything that is not the front cover.
     if not selected:
         selected = [p for p in image_files if not _name_contains(p, front_keywords)]
     if not selected:
         return None

     target = selected[0]
     try:
         encoded = base64.b64encode(target.read_bytes()).decode()
     except Exception as read_err:
         print(f"  ⚠️ OCR-Bild lesen {target.name}: {read_err}", file=sys.stderr)
         return None

     # The user message is identical for every model; only "model" changes.
     user_message = {
         "role": "user",
         "content": _OCR_PROMPT,
         "images": [encoded],
     }

     for model in _OCR_MODELS:
         body = json.dumps({
             "model": model,
             "messages": [user_message],
             "stream": False,
             "options": {"temperature": 0.0},
         }).encode()

         try:
             request = urllib.request.Request(
                 f"{_OLLAMA_HOST}/api/chat",
                 data=body,
                 headers={"Content-Type": "application/json"},
                 method="POST",
             )
             with urllib.request.urlopen(request, timeout=180) as response:
                 reply = json.loads(response.read())

             text = reply.get("message", {}).get("content", "").strip()

             # The sentinel ends the whole attempt; later models are not asked.
             if "NO_TRACKLIST" in text:
                 print(f"  📷 OCR {target.name}: kein Tracklist-Text erkannt", file=sys.stderr)
                 return None
             if text:
                 print(f"  📷 OCR {target.name} via {model}: {len(text)} Zeichen extrahiert",
                       file=sys.stderr)
                 return text
             # Empty reply: fall through and try the next model.
         except Exception as ocr_err:
             # Best effort: report the failure and try the next model.
             print(f"  ⚠️ OCR-Fehler ({model}) {target.name}: {ocr_err}", file=sys.stderr)

     return None
 def _check_cover_images(paths: List[Path]) -> List[Path]:
    good: List[Path] = []
    for p in paths:
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
    return good
-def extract_hints(scan: AlbumScan) -> AlbumHints:
+def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
    hints = AlbumHints(album_dir=scan.album_dir)
    # Directory name
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
            texts.append(txt)
    hints.tracklist_text = "\n\n".join(texts) if texts else None
    # OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
    if use_ocr and not hints.tracklist_text and scan.image_files:
        ocr_text = _ocr_back_cover(scan.image_files)
        if ocr_text:
            hints.tracklist_text = ocr_text
    parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
    # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
--- a/music_enricher.py
+++ b/music_enricher.py
@ -90,7 +90,7 @@ def process_album(
            stats["skipped"] += 1
            return stats
-        hints = extract_hints(scan)
+        hints = extract_hints(scan, use_ocr=not args.no_api)
        proposal = resolve(
            hints,