Add OCR fallback via Ollama Vision for albums without tracklist text
hint_extractor: _ocr_back_cover() sends one image to Ollama Vision when no tracklist .txt/.htm/.nfo is present: the first image whose name looks like a back cover, inlay or booklet page, or, failing that, the first image that is not a front cover. Models are tried in priority order: qwen3-vl:latest → minicpm-v:latest → deepseek-ocr:latest (overridable via the OLLAMA_OCR_MODEL env var as a comma-separated list). The timeout is 180s per request. OCR text is fed into the same _parse_tracklist() pipeline as regular text files. music_enricher: extract_hints(use_ocr=not args.no_api) — OCR is skipped with --no-api to allow fully offline/fast runs. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
28f716f8f2
commit
40a2ef3fb6
2 changed files with 91 additions and 2 deletions
|
|
@ -1,7 +1,11 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, List, Dict, Tuple
|
from typing import Optional, List, Dict, Tuple
|
||||||
|
|
||||||
|
|
@ -224,6 +228,85 @@ def _read_tracklist_file(path: Path) -> Optional[str]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
_DEFAULT_OLLAMA_HOST = "http://localhost:11434"
_DEFAULT_OLLAMA_PORT = 11434


def _normalize_ollama_host(raw: str) -> str:
    """Return *raw* as a base URL that an API path can be appended to.

    Ollama's own tooling accepts ``OLLAMA_HOST`` without a scheme (for
    example ``0.0.0.0:11434`` or just ``127.0.0.1``). urllib rejects such
    values, so they are completed here:

    - surrounding whitespace and trailing slashes are removed,
    - an empty value falls back to the local default,
    - a value without a scheme gets ``http://``, plus the default Ollama
      port when it does not already end in ``:<digits>``,
    - a value that already carries a scheme is kept as given.
    """
    host = raw.strip().rstrip("/")
    if not host:
        return _DEFAULT_OLLAMA_HOST
    if "://" in host:
        return host
    # Scheme-less form: only append the default port when none is present.
    _, sep, tail = host.rpartition(":")
    if not (sep and tail.isdigit()):
        host = f"{host}:{_DEFAULT_OLLAMA_PORT}"
    return f"http://{host}"


# Base URL of the Ollama server, taken from the environment when set.
_OLLAMA_HOST = _normalize_ollama_host(os.getenv("OLLAMA_HOST", _DEFAULT_OLLAMA_HOST))

# Models in priority order; override with a comma-separated OLLAMA_OCR_MODEL.
# Blank entries (e.g. from a trailing comma) are dropped.
_OCR_MODELS = [m.strip() for m in os.getenv(
    "OLLAMA_OCR_MODEL",
    "qwen3-vl:latest,minicpm-v:latest,deepseek-ocr:latest"
).split(",") if m.strip()]

# Instruction sent along with the image. The NO_TRACKLIST sentinel lets the
# caller tell "nothing to read on this image" apart from a real result.
_OCR_PROMPT = (
    "This image shows a CD album back cover or booklet page. "
    "Your task: extract the complete tracklist as plain text.\n"
    "Rules:\n"
    "- Output track number and title per line, e.g. '1. Title' or '1-1 Title'\n"
    "- If multiple discs/CDs: include a header like 'CD 1' or 'Disc 1' before each group\n"
    "- Include durations if visible (e.g. '1. Title 4:32')\n"
    "- Do NOT include label info, barcodes, or other non-tracklist text\n"
    "- If no tracklist is visible, reply with: NO_TRACKLIST"
)
|
||||||
|
|
||||||
|
|
||||||
|
def _ocr_back_cover(image_files: List[Path]) -> Optional[str]:
    """
    Extract tracklist text from a back-cover or booklet image via Ollama Vision.

    Candidate images are those whose file name contains "back", "inlay",
    "booklet", "inside" or "rear". If none match, every image whose name
    does not contain "front", "folder" or "cover" becomes a candidate.
    The first candidate that can be read from disk is sent to each model
    in ``_OCR_MODELS`` in turn until one gives a usable answer.

    Args:
        image_files: Image paths belonging to the album.

    Returns:
        The text returned by the model, or None when there is no readable
        candidate image, a model answers NO_TRACKLIST, or every model
        fails or returns an empty answer.
    """
    back_keywords = ("back", "inlay", "booklet", "inside", "rear")
    front_keywords = ("front", "folder", "cover")

    # Prefer images that look like a back cover.
    candidates = [
        p for p in image_files
        if any(kw in p.name.lower() for kw in back_keywords)
    ]
    # Fallback: every image except the front cover.
    if not candidates:
        candidates = [
            p for p in image_files
            if not any(kw in p.name.lower() for kw in front_keywords)
        ]

    # Use the first candidate that can actually be read. One unreadable file
    # must not disable OCR for the album while other candidates exist.
    image_path = None
    img_b64 = ""
    for candidate in candidates:
        try:
            img_b64 = base64.b64encode(candidate.read_bytes()).decode()
        except (OSError, MemoryError) as e:
            print(f" ⚠️ OCR-Bild lesen {candidate.name}: {e}", file=sys.stderr)
            continue
        image_path = candidate
        break
    if image_path is None:
        return None

    # Tolerate a trailing slash on the configured host.
    chat_url = f"{_OLLAMA_HOST.rstrip('/')}/api/chat"

    for model in _OCR_MODELS:
        payload = json.dumps({
            "model": model,
            "messages": [{
                "role": "user",
                "content": _OCR_PROMPT,
                "images": [img_b64],
            }],
            "stream": False,
            "options": {"temperature": 0.0},
        }).encode()
        try:
            req = urllib.request.Request(
                chat_url,
                data=payload,
                headers={"Content-Type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=180) as resp:
                data = json.loads(resp.read())
            # "message" or "content" may be missing or null in the response.
            message = data.get("message") or {}
            text = (message.get("content") or "").strip()
        except Exception as e:
            # Deliberate best-effort boundary: a failure of any kind with
            # this model (network, HTTP, malformed JSON) moves on to the next.
            print(f" ⚠️ OCR-Fehler ({model}) {image_path.name}: {e}", file=sys.stderr)
            continue

        if not text:
            # Empty answer: give the next model a chance.
            continue
        if "NO_TRACKLIST" in text:
            print(f" 📷 OCR {image_path.name}: kein Tracklist-Text erkannt", file=sys.stderr)
            return None
        print(f" 📷 OCR {image_path.name} via {model}: {len(text)} Zeichen extrahiert",
              file=sys.stderr)
        return text

    return None
|
||||||
|
|
||||||
|
|
||||||
def _check_cover_images(paths: List[Path]) -> List[Path]:
|
def _check_cover_images(paths: List[Path]) -> List[Path]:
|
||||||
good: List[Path] = []
|
good: List[Path] = []
|
||||||
for p in paths:
|
for p in paths:
|
||||||
|
|
@ -236,7 +319,7 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
|
||||||
return good
|
return good
|
||||||
|
|
||||||
|
|
||||||
def extract_hints(scan: AlbumScan) -> AlbumHints:
|
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
||||||
hints = AlbumHints(album_dir=scan.album_dir)
|
hints = AlbumHints(album_dir=scan.album_dir)
|
||||||
|
|
||||||
# Directory name
|
# Directory name
|
||||||
|
|
@ -253,6 +336,12 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
|
||||||
texts.append(txt)
|
texts.append(txt)
|
||||||
hints.tracklist_text = "\n\n".join(texts) if texts else None
|
hints.tracklist_text = "\n\n".join(texts) if texts else None
|
||||||
|
|
||||||
|
# OCR-Fallback: Back-Cover scannen wenn keine Tracklist-Textdatei vorhanden
|
||||||
|
if use_ocr and not hints.tracklist_text and scan.image_files:
|
||||||
|
ocr_text = _ocr_back_cover(scan.image_files)
|
||||||
|
if ocr_text:
|
||||||
|
hints.tracklist_text = ocr_text
|
||||||
|
|
||||||
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
|
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
|
||||||
|
|
||||||
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
|
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
|
||||||
|
|
|
||||||
|
|
@ -90,7 +90,7 @@ def process_album(
|
||||||
stats["skipped"] += 1
|
stats["skipped"] += 1
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
hints = extract_hints(scan)
|
hints = extract_hints(scan, use_ocr=not args.no_api)
|
||||||
|
|
||||||
proposal = resolve(
|
proposal = resolve(
|
||||||
hints,
|
hints,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue