Add OCR fallback via Ollama Vision for albums without tracklist text

hint_extractor: _ocr_back_cover() sends back/inlay images to Ollama Vision
  when no tracklist .txt/.htm/.nfo is present. Model priority:
  qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (configurable
  via OLLAMA_OCR_MODEL env var). Timeout 180s. OCR text is fed into the
  same _parse_tracklist() pipeline as regular text files.

music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped
  with --no-api to allow fully offline/fast runs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-04-29 03:08:21 +02:00
commit 40a2ef3fb6
2 changed files with 91 additions and 2 deletions

View file

@ -1,7 +1,11 @@
from __future__ import annotations from __future__ import annotations
import base64
import json
import os
import re import re
import sys import sys
import urllib.request
from pathlib import Path from pathlib import Path
from typing import Optional, List, Dict, Tuple from typing import Optional, List, Dict, Tuple
@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
return None return None
# Base URL of the Ollama server, taken from OLLAMA_HOST.
# The value is used as a prefix for f"{_OLLAMA_HOST}/api/chat", so it has to be
# a complete http(s) URL without a trailing slash. Ollama's own tooling also
# accepts scheme-less values such as "127.0.0.1:11434"; normalise those (and
# empty values / trailing slashes) here instead of failing on every request.
_OLLAMA_HOST = (
    os.getenv("OLLAMA_HOST", "").strip().rstrip("/") or "http://localhost:11434"
)
if "://" not in _OLLAMA_HOST:
    _OLLAMA_HOST = f"http://{_OLLAMA_HOST}"

# Vision models in priority order; override with a comma-separated list in
# OLLAMA_OCR_MODEL. Blank entries are dropped.
_OCR_MODELS = [
    name.strip()
    for name in os.getenv(
        "OLLAMA_OCR_MODEL",
        "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest",
    ).split(",")
    if name.strip()
]

# Instruction sent together with the image. The NO_TRACKLIST sentinel is what
# _ocr_back_cover() looks for to detect "nothing to extract".
_OCR_PROMPT = (
    "This image shows a CD album back cover or booklet page. "
    "Your task: extract the complete tracklist as plain text.\n"
    "Rules:\n"
    "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'\n"
    "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group\n"
    "- Include durations if visible (e.g. '1. Title 4:32')\n"
    "- Do NOT include label info, barcodes, or other non-tracklist text\n"
    "- If no tracklist is visible, reply with: NO_TRACKLIST"
)
def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
    """Extract tracklist text from a back-cover or booklet image via Ollama Vision.

    Candidate images are those whose file name contains "back", "inlay",
    "booklet", "inside" or "rear". If none match, every image whose name does
    not contain "front", "folder" or "cover" is considered instead. The first
    candidate that can be read is sent to each model in ``_OCR_MODELS`` in
    turn until one of them produces an answer.

    Args:
        image_files: Image paths belonging to the album.

    Returns:
        The text returned by the model, or ``None`` when no candidate image
        can be read, a model answers ``NO_TRACKLIST``, the Ollama server
        cannot be reached, or every model fails or returns empty text.
    """
    import urllib.error  # local import: only this function needs it

    back_keywords = ("back", "inlay", "booklet", "inside", "rear")
    front_keywords = ("front", "folder", "cover")

    def name_has(path: Path, keywords: Tuple[str, ...]) -> bool:
        lowered = path.name.lower()
        return any(kw in lowered for kw in keywords)

    # Prefer images that look like a back cover ...
    candidates = [p for p in image_files if name_has(p, back_keywords)]
    # ... otherwise fall back to everything that is not a front cover.
    if not candidates:
        candidates = [p for p in image_files if not name_has(p, front_keywords)]

    # Use the first candidate that can actually be read; a single broken file
    # must not disable OCR when other candidates are available.
    image_path: Optional[Path] = None
    img_b64 = ""
    for candidate in candidates:
        try:
            img_b64 = base64.b64encode(candidate.read_bytes()).decode()
        except Exception as e:  # OCR is best-effort: report and move on
            print(f" ⚠️ OCR-Bild lesen {candidate.name}: {e}", file=sys.stderr)
            continue
        image_path = candidate
        break
    if image_path is None:
        return None

    for model in _OCR_MODELS:
        payload = json.dumps({
            "model": model,
            "messages": [{
                "role": "user",
                "content": _OCR_PROMPT,
                "images": [img_b64],
            }],
            "stream": False,
            "options": {"temperature": 0.0},
        }).encode()
        try:
            req = urllib.request.Request(
                f"{_OLLAMA_HOST}/api/chat",
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=180) as resp:
                data = json.loads(resp.read())
            # Tolerate a missing/null "message" or "content" field.
            text = ((data.get("message") or {}).get("content") or "").strip()
        except urllib.error.HTTPError as e:
            # The server answered with an error status for this model;
            # another model may still work.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            continue
        except urllib.error.URLError as e:
            # The server itself could not be reached. That does not depend on
            # the model, so retrying with the remaining models is pointless.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            return None
        except Exception as e:
            # Read timeout, invalid JSON, unexpected payload shape, ...
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            continue

        if "NO_TRACKLIST" in text:
            print(f" 📷 OCR {image_path.name}: kein Tracklist-Text erkannt", file=sys.stderr)
            return None
        if text:
            print(f" 📷 OCR {image_path.name} via {model}: {len(text)} Zeichen extrahiert",
                  file=sys.stderr)
            return text
        # Empty answer: try the next model.
    return None
def _check_cover_images(paths: List[Path]) -> List[Path]: def _check_cover_images(paths: List[Path]) -> List[Path]:
good: List[Path] = [] good: List[Path] = []
for p in paths: for p in paths:
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
return good return good
def extract_hints(scan: AlbumScan) -> AlbumHints: def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
hints = AlbumHints(album_dir=scan.album_dir) hints = AlbumHints(album_dir=scan.album_dir)
# Directory name # Directory name
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
texts.append(txt) texts.append(txt)
hints.tracklist_text = "\n\n".join(texts) if texts else None hints.tracklist_text = "\n\n".join(texts) if texts else None
# OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
if use_ocr and not hints.tracklist_text and scan.image_files:
ocr_text = _ocr_back_cover(scan.image_files)
if ocr_text:
hints.tracklist_text = ocr_text
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else [] parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer

View file

@ -90,7 +90,7 @@ def process_album(
stats["skipped"] += 1 stats["skipped"] += 1
return stats return stats
hints = extract_hints(scan) hints = extract_hints(scan, use_ocr=not args.no_api)
proposal = resolve( proposal = resolve(
hints, hints,