Add OCR fallback via Ollama Vision for albums without tracklist text

hint_extractor: _ocr_back_cover() sends back/inlay images to Ollama Vision
  when no tracklist .txt/.htm/.nfo is present. Model priority:
  qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (configurable
  via OLLAMA_OCR_MODEL env var). Timeout 180s. OCR text is fed into the
  same _parse_tracklist() pipeline as regular text files.

music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped
  with --no-api to allow fully offline/fast runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-04-29 03:08:21 +02:00
commit 40a2ef3fb6
2 changed files with 91 additions and 2 deletions

View file

@ -1,7 +1,11 @@
from __future__ import annotations
import base64
import json
import os
import re
import sys
import urllib.request
from pathlib import Path
from typing import Optional, List, Dict, Tuple
@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
return None
_DEFAULT_OLLAMA_HOST = "http://localhost:11434"


def _normalize_ollama_host(raw: Optional[str]) -> str:
    """Turn an OLLAMA_HOST value into a base URL usable as a urllib prefix.

    OLLAMA_HOST is frequently set without a scheme (e.g. "0.0.0.0:11434"),
    which urllib rejects as an unknown URL type. A missing scheme is
    therefore defaulted to http://, a trailing slash is dropped so that
    appending "/api/chat" does not produce "//api/chat", and an unset or
    blank value falls back to the local default.
    """
    host = (raw or "").strip().rstrip("/")
    if not host:
        return _DEFAULT_OLLAMA_HOST
    if "://" not in host:
        host = f"http://{host}"
    return host


# Base URL of the Ollama server, without a trailing slash.
_OLLAMA_HOST = _normalize_ollama_host(os.getenv("OLLAMA_HOST"))

# Vision models in priority order; override with a comma-separated list in
# the OLLAMA_OCR_MODEL environment variable. Blank entries are dropped.
_OCR_MODELS = [
    name.strip()
    for name in os.getenv(
        "OLLAMA_OCR_MODEL",
        "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest",
    ).split(",")
    if name.strip()
]

# Prompt sent together with the image. The NO_TRACKLIST sentinel lets the
# caller tell "nothing readable" apart from an actual tracklist.
_OCR_PROMPT = (
    "This image shows a CD album back cover or booklet page. "
    "Your task: extract the complete tracklist as plain text.\n"
    "Rules:\n"
    "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'\n"
    "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group\n"
    "- Include durations if visible (e.g. '1. Title 4:32')\n"
    "- Do NOT include label info, barcodes, or other non-tracklist text\n"
    "- If no tracklist is visible, reply with: NO_TRACKLIST"
)
def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
    """Run OCR on one back-cover or booklet image via the Ollama chat API.

    Exactly one image is processed: the first file whose name contains a
    back-cover keyword, or, if there is none, the first file whose name
    does not look like a front cover. The configured models are tried in
    order until one returns a non-empty answer.

    Returns the text reported by the model, or None when no suitable image
    exists, the image cannot be read, a model answers with NO_TRACKLIST,
    or every model fails or returns an empty answer.
    """
    back_words = ("back", "inlay", "booklet", "inside", "rear")
    front_words = ("front", "folder", "cover")

    # First choice: files whose name suggests a back cover or booklet page.
    shortlist = []
    for path in image_files:
        lowered = path.name.lower()
        if any(word in lowered for word in back_words):
            shortlist.append(path)

    # Fallback: anything that is not obviously the front cover.
    if not shortlist:
        for path in image_files:
            lowered = path.name.lower()
            if all(word not in lowered for word in front_words):
                shortlist.append(path)

    if not shortlist:
        return None

    target = shortlist[0]
    try:
        raw_bytes = target.read_bytes()
        encoded = base64.b64encode(raw_bytes).decode()
    except Exception as err:
        print(f" ⚠️ OCR-Bild lesen {target.name}: {err}", file=sys.stderr)
        return None

    for model_name in _OCR_MODELS:
        chat_request = {
            "model": model_name,
            "messages": [{
                "role": "user",
                "content": _OCR_PROMPT,
                "images": [encoded],
            }],
            "stream": False,
            "options": {"temperature": 0.0},
        }
        body = json.dumps(chat_request).encode()
        try:
            request = urllib.request.Request(
                f"{_OLLAMA_HOST}/api/chat",
                data=body,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(request, timeout=180) as response:
                reply = json.loads(response.read())
            answer = reply.get("message", {}).get("content", "").strip()

            # The sentinel ends the search: later models are not consulted.
            if "NO_TRACKLIST" in answer:
                print(f" 📷 OCR {target.name}: kein Tracklist-Text erkannt", file=sys.stderr)
                return None
            if answer:
                print(f" 📷 OCR {target.name} via {model_name}: {len(answer)} Zeichen extrahiert",
                      file=sys.stderr)
                return answer
            # Empty answer: fall through and try the next model.
        except Exception as err:
            # Best effort: any failure with this model moves on to the next.
            print(f" ⚠️ OCR-Fehler ({model_name}) {target.name}: {err}", file=sys.stderr)

    return None
def _check_cover_images(paths: List[Path]) -> List[Path]:
good: List[Path] = []
for p in paths:
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
return good
def extract_hints(scan: AlbumScan) -> AlbumHints:
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
hints = AlbumHints(album_dir=scan.album_dir)
# Directory name
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
texts.append(txt)
hints.tracklist_text = "\n\n".join(texts) if texts else None
# OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
if use_ocr and not hints.tracklist_text and scan.image_files:
ocr_text = _ocr_back_cover(scan.image_files)
if ocr_text:
hints.tracklist_text = ocr_text
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer

View file

@ -90,7 +90,7 @@ def process_album(
stats["skipped"] += 1
return stats
hints = extract_hints(scan)
hints = extract_hints(scan, use_ocr=not args.no_api)
proposal = resolve(
hints,