Add YouTube ID detection and metadata lookup via yt-dlp

- Extract 11-char YouTube video IDs from audio filenames
- Fetch title, uploader, chapters via yt-dlp (--dump-json)
- Use chapters as tracklist when no tracklist text (from .txt files or cover OCR) is available
- Store yt_title / yt_uploader in AlbumHints for LLM prompt context
- Fall back to the YouTube video title for any track that still lacks a usable title (typically single-file albums)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-04-29 05:19:26 +02:00
commit b6abfae16c
3 changed files with 115 additions and 0 deletions

103
hint_extractor.py Normal file → Executable file
View file

@ -4,6 +4,8 @@ import base64
import json import json
import os import os
import re import re
import shutil
import subprocess
import sys import sys
import urllib.request import urllib.request
from pathlib import Path from pathlib import Path
@ -319,6 +321,59 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
return good return good
# YouTube video ID: 11 characters from [A-Za-z0-9_-], embedded in the filename
# and not directly adjacent to any other ID character.
_YT_ID_RE = re.compile(r"(?<![A-Za-z0-9_-])([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])")
# The same token wrapped in square brackets, which is how yt-dlp's default
# output template ("%(title)s [%(id)s].%(ext)s") writes the ID.
_YT_BRACKETED_ID_RE = re.compile(r"\[([A-Za-z0-9_-]{11})\]")


def _looks_like_youtube_id(candidate: str) -> bool:
    """Return True if *candidate* has an uppercase letter plus a digit or lowercase letter."""
    return bool(
        re.search(r"[A-Z]", candidate) and re.search(r"[0-9a-z]", candidate)
    )


def _extract_youtube_id(path: Path) -> Optional[str]:
    """Find a YouTube video ID in the file name of *path*.

    The whole file name (stem plus suffix) is searched. Tokens enclosed in
    square brackets are tried first, so that an 11-character word in the
    title (e.g. ``Beethoven_9``) cannot shadow the real ID in a name such as
    ``Beethoven_9 [dQw4w9WgXcQ].mp3``. If no bracketed token qualifies, every
    stand-alone 11-character token is considered in order of appearance.

    Returns:
        The first candidate passing the mixed-character plausibility check,
        or ``None`` if the name contains no such token.
    """
    name = path.name  # identical to path.stem + path.suffix
    for pattern in (_YT_BRACKETED_ID_RE, _YT_ID_RE):
        for match in pattern.finditer(name):
            candidate = match.group(1)
            # Heuristic only: rejects all-lowercase / all-digit tokens, which
            # are far more likely to be ordinary words or numbers than IDs.
            if _looks_like_youtube_id(candidate):
                return candidate
    return None
def _fetch_youtube_metadata(video_id: str) -> Optional[Dict]:
    """Fetch metadata for a YouTube video by running yt-dlp (no API key needed).

    Runs ``yt-dlp --dump-json --no-download --no-playlist`` on the watch URL
    built from *video_id*, with a 30 second timeout, and parses its stdout
    as JSON.

    Returns:
        The parsed JSON object (callers in this file read ``title``,
        ``uploader``, ``channel`` and ``chapters`` from it), or ``None`` when
        yt-dlp is not on PATH, exits with an error, times out, or prints
        something that is not a JSON object. Failures after yt-dlp was found
        are reported on stderr; this function never raises for them.
    """
    ytdlp = shutil.which("yt-dlp")
    if not ytdlp:
        return None

    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        result = subprocess.run(
            [ytdlp, "--dump-json", "--no-download", "--no-playlist", url],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode != 0 or not result.stdout.strip():
            # Surface yt-dlp's own last error line instead of failing silently.
            err_lines = (result.stderr or "").strip().splitlines()
            detail = err_lines[-1] if err_lines else "keine Ausgabe"
            print(
                f" ⚠️ YouTube-Fehler ({video_id}): "
                f"yt-dlp exit {result.returncode}: {detail}",
                file=sys.stderr,
            )
            return None
        meta = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: yt-dlp could not be executed; SubprocessError: timeout;
        # ValueError: undecodable output or invalid JSON.
        print(f" ⚠️ YouTube-Fehler ({video_id}): {e}", file=sys.stderr)
        return None

    # Callers use dict methods on the result, so reject any other JSON value.
    if not isinstance(meta, dict):
        return None
    return meta
def _chapters_to_tracklist_text(chapters: List[Dict]) -> str:
    """Convert yt-dlp chapter entries into tracklist text.

    Each usable chapter becomes one line of the form ``'1. Title MM:SS'``,
    the format intended for ``_parse_tracklist``. Chapters without a title,
    or whose title starts with ``<Untitled``, are skipped. The line number is
    the chapter's 1-based position in *chapters*, so skipped chapters leave
    gaps in the numbering. Minutes are not folded into hours (e.g. ``62:05``).

    Args:
        chapters: Chapter dicts; ``title`` and ``start_time`` (seconds) are
            read. Missing or null values are treated as an empty title and
            a start time of 0.

    Returns:
        The lines joined with newlines, or an empty string if no chapter
        is usable.
    """
    lines = []
    for i, ch in enumerate(chapters or [], 1):
        # "or" also covers keys that are present with a null value, where
        # .get(key, default) would hand back None and crash on .strip()/int().
        title = (ch.get("title") or "").strip()
        if not title or title.startswith("<Untitled"):
            continue
        secs = int(ch.get("start_time") or 0)
        mm, ss = divmod(secs, 60)
        lines.append(f"{i}. {title} {mm}:{ss:02d}")
    return "\n".join(lines)
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints: def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
hints = AlbumHints(album_dir=scan.album_dir) hints = AlbumHints(album_dir=scan.album_dir)
@ -342,6 +397,43 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
if ocr_text: if ocr_text:
hints.tracklist_text = ocr_text hints.tracklist_text = ocr_text
# YouTube-Lookup: IDs aus Dateinamen extrahieren, Metadaten per yt-dlp holen
yt_meta_by_id: Dict[str, Optional[Dict]] = {}
yt_ids_by_stem: Dict[str, str] = {} # stem (normalisiert) → youtube_id
for audio_path in scan.audio_files:
yt_id = _extract_youtube_id(audio_path)
if yt_id:
stem_key = _clean(audio_path.stem).casefold()
yt_ids_by_stem[stem_key] = yt_id
yt_meta_by_id.setdefault(yt_id, None)
if yt_meta_by_id:
print(f" 📺 YouTube-IDs gefunden: {', '.join(list(yt_meta_by_id.keys())[:5])}", file=sys.stderr)
for yt_id in list(yt_meta_by_id.keys())[:5]:
meta = _fetch_youtube_metadata(yt_id)
yt_meta_by_id[yt_id] = meta
# Chapters als Tracklist nutzen wenn noch keine vorhanden
if not hints.tracklist_text:
for yt_id, meta in yt_meta_by_id.items():
if meta and meta.get("chapters"):
chapter_text = _chapters_to_tracklist_text(meta["chapters"])
if chapter_text:
hints.tracklist_text = chapter_text
print(f" 📺 YouTube-Chapters als Tracklist: {len(meta['chapters'])} Tracks",
file=sys.stderr)
break
# Album-Level-Hints (erster erfolgreicher Treffer)
for yt_id, meta in yt_meta_by_id.items():
if meta:
hints.yt_title = (meta.get("title") or "").strip() or None
hints.yt_uploader = (
meta.get("uploader") or meta.get("channel") or ""
).strip() or None
break
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else [] parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
@ -467,6 +559,17 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
if stem_key in m3u_titles: if stem_key in m3u_titles:
title = m3u_titles[stem_key] title = m3u_titles[stem_key]
# YouTube-Titel als letzter Fallback (bei einzelner Datei = das ganze Video)
if not _is_good(title):
stem_key = _clean(audio_path.stem).casefold()
yt_id = yt_ids_by_stem.get(stem_key)
if yt_id:
meta = yt_meta_by_id.get(yt_id)
if meta:
yt_video_title = (meta.get("title") or "").strip()
if yt_video_title:
title = yt_video_title
hints.tracks.append(TrackHints( hints.tracks.append(TrackHints(
path=audio_path, path=audio_path,
track_number=track_num, track_number=track_num,

10
metadata_resolver.py Normal file → Executable file
View file

@ -213,6 +213,8 @@ def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str:
f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): " f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): "
f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n" f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n"
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n" f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
+ (f"YouTube-Videotitel: {hints.yt_title}\n" if hints.yt_title else "")
+ (f"YouTube-Uploader/Kanal: {hints.yt_uploader}\n" if hints.yt_uploader else "")
+ (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "") + (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "")
+ f"Tracks:\n{tracks_summary}\n\n" + f"Tracks:\n{tracks_summary}\n\n"
'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n' 'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n'
@ -354,9 +356,17 @@ def resolve(
genre = genre or t.existing_tags.get("genre") genre = genre or t.existing_tags.get("genre")
label = label or t.existing_tags.get("label") or t.existing_tags.get("organization") label = label or t.existing_tags.get("label") or t.existing_tags.get("organization")
# YouTube-Metadaten als zusätzliche Hinweise (Uploader → Künstler, Titel → Album/Track)
if hints.yt_uploader and not artist:
artist = hints.yt_uploader
if hints.yt_title and not album:
album = hints.yt_title
if artist or album: if artist or album:
confidence += 0.05 confidence += 0.05
sources.append("local-hints") sources.append("local-hints")
if hints.yt_title or hints.yt_uploader:
sources.append("youtube")
# AcoustID fingerprinting # AcoustID fingerprinting
fp_mbids: Dict[str, List[str]] = {} fp_mbids: Dict[str, List[str]] = {}

2
models.py Normal file → Executable file
View file

@ -50,6 +50,8 @@ class AlbumHints:
tracklist_text: Optional[str] = None # merged text from all tracklist files tracklist_text: Optional[str] = None # merged text from all tracklist files
cover_images: List[Path] = field(default_factory=list) cover_images: List[Path] = field(default_factory=list)
tracks: List[TrackHints] = field(default_factory=list) tracks: List[TrackHints] = field(default_factory=list)
yt_title: Optional[str] = None # YouTube video title (if found)
yt_uploader: Optional[str] = None # YouTube channel/uploader name
@dataclass @dataclass