Add OCR fallback via Ollama Vision for albums without tracklist text
hint_extractor: _ocr_back_cover() sends back/inlay images to Ollama Vision when no tracklist .txt/.htm/.nfo is present. Model priority: qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (configurable via OLLAMA_OCR_MODEL env var). Timeout 180s. OCR text is fed into the same _parse_tracklist() pipeline as regular text files. music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped with --no-api to allow fully offline/fast runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
28f716f8f2
commit
40a2ef3fb6
2 changed files with 91 additions and 2 deletions
|
|
@ -1,7 +1,11 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
|
||||
|
|
@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
|
|||
return None
|
||||
|
||||
|
||||
def _normalize_ollama_host(raw: str) -> str:
    """Turn an ``OLLAMA_HOST`` value into a usable base URL.

    The variable is frequently set as a bare ``host:port`` (no scheme), which
    ``urllib`` rejects, or with a trailing slash, which would produce a
    ``//api/chat`` path once the endpoint is appended.

    Args:
        raw: Value read from the environment; may be blank.

    Returns:
        Base URL with a scheme and without a trailing slash. A blank value
        falls back to ``http://localhost:11434``.
    """
    host = raw.strip().rstrip("/")
    if not host:
        return "http://localhost:11434"
    if "://" not in host:
        host = f"http://{host}"
    return host


# Base URL of the Ollama server, overridable via the OLLAMA_HOST env var.
_OLLAMA_HOST = _normalize_ollama_host(
    os.getenv("OLLAMA_HOST", "http://localhost:11434")
)

# Vision models in priority order; overridable via OLLAMA_OCR_MODEL
# (comma-separated, blank entries are dropped).
_OCR_MODELS = [
    m.strip()
    for m in os.getenv(
        "OLLAMA_OCR_MODEL",
        "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest",
    ).split(",")
    if m.strip()
]
|
||||
|
||||
# Instruction sent with the image to the vision model. The model is asked to
# answer with the literal marker NO_TRACKLIST when it sees no tracklist.
_OCR_PROMPT = "\n".join([
    (
        "This image shows a CD album back cover or booklet page. "
        "Your task: extract the complete tracklist as plain text."
    ),
    "Rules:",
    "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'",
    "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group",
    "- Include durations if visible (e.g. '1. Title 4:32')",
    "- Do NOT include label info, barcodes, or other non-tracklist text",
    "- If no tracklist is visible, reply with: NO_TRACKLIST",
])
|
||||
|
||||
|
||||
def _select_ocr_candidates(image_files: List[Path]) -> List[Path]:
    """Return the images worth OCR-ing, in their original order.

    Files whose name suggests a back cover / inlay / booklet are preferred.
    If none match, every image that does not look like a front cover is
    returned instead.
    """
    back_keywords = ("back", "inlay", "booklet", "inside", "rear")
    front_keywords = ("front", "folder", "cover")

    def name_has(path: Path, keywords: Tuple[str, ...]) -> bool:
        lowered = path.name.lower()
        return any(kw in lowered for kw in keywords)

    likely_back = [p for p in image_files if name_has(p, back_keywords)]
    if likely_back:
        return likely_back
    # Fallback: everything except the front cover.
    return [p for p in image_files if not name_has(p, front_keywords)]


def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
    """OCR a back-cover or booklet image via Ollama Vision.

    Only the first candidate image is sent. The models in ``_OCR_MODELS`` are
    tried in order until one returns a non-empty answer.

    Args:
        image_files: Image paths belonging to the album.

    Returns:
        The recognised text, or ``None`` if there is no suitable image, the
        image cannot be read, a model answered ``NO_TRACKLIST``, the server
        is unreachable, or no model produced any text.
    """
    # Local import keeps this block self-contained; the module itself only
    # imports urllib.request.
    from urllib.error import HTTPError, URLError

    candidates = _select_ocr_candidates(image_files)
    if not candidates:
        return None

    image_path = candidates[0]
    try:
        img_b64 = base64.b64encode(image_path.read_bytes()).decode()
    except Exception as e:
        print(f" ⚠️ OCR-Bild lesen {image_path.name}: {e}", file=sys.stderr)
        return None

    for model in _OCR_MODELS:
        payload = json.dumps({
            "model": model,
            "messages": [{
                "role": "user",
                "content": _OCR_PROMPT,
                "images": [img_b64],
            }],
            "stream": False,
            "options": {"temperature": 0.0},
        }).encode()
        try:
            req = urllib.request.Request(
                f"{_OLLAMA_HOST}/api/chat",
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=180) as resp:
                data = json.loads(resp.read())
            # "message" / "content" may be missing or null; treat as empty.
            text = ((data.get("message") or {}).get("content") or "").strip()
        except HTTPError as e:
            # The server answered, but not successfully for this model
            # (e.g. the model is not installed) - try the next one.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            continue
        except URLError as e:
            # No connection could be established. Every other model would hit
            # the same host, so stop instead of repeating the failure (and a
            # possible connect timeout) once per model.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            return None
        except Exception as e:
            # Best effort: read timeouts, malformed JSON, unexpected payloads.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            continue

        if "NO_TRACKLIST" in text:
            print(f" 📷 OCR {image_path.name}: kein Tracklist-Text erkannt", file=sys.stderr)
            return None
        if text:
            print(f" 📷 OCR {image_path.name} via {model}: {len(text)} Zeichen extrahiert",
                  file=sys.stderr)
            return text
        # Empty answer: fall through to the next model.

    return None
|
||||
|
||||
|
||||
def _check_cover_images(paths: List[Path]) -> List[Path]:
|
||||
good: List[Path] = []
|
||||
for p in paths:
|
||||
|
|
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
|
|||
return good
|
||||
|
||||
|
||||
def extract_hints(scan: AlbumScan) -> AlbumHints:
|
||||
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
||||
hints = AlbumHints(album_dir=scan.album_dir)
|
||||
|
||||
# Directory name
|
||||
|
|
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
|
|||
texts.append(txt)
|
||||
hints.tracklist_text = "\n\n".join(texts) if texts else None
|
||||
|
||||
# OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
|
||||
if use_ocr and not hints.tracklist_text and scan.image_files:
|
||||
ocr_text = _ocr_back_cover(scan.image_files)
|
||||
if ocr_text:
|
||||
hints.tracklist_text = ocr_text
|
||||
|
||||
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
|
||||
|
||||
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue