Add YouTube ID detection and metadata lookup via yt-dlp

- Extract 11-char YouTube video IDs from audio filenames
- Fetch title, uploader, chapters via yt-dlp (--dump-json)
- Use chapters as tracklist when no .txt tracklist is available
- Store yt_title / yt_uploader in AlbumHints for LLM prompt context
- Fall back to YouTube video title as track title for single-file albums

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 05:19:26 +02:00 · 2026-04-29 05:19:26 +02:00 · b6abfae16c
commit b6abfae16c
parent 888464b4d0
3 changed files with 115 additions and 0 deletions
--- a/hint_extractor.py
+++ b/hint_extractor.py
@ -4,6 +4,8 @@ import base64
 import json
 import os
 import re
+import shutil
+import subprocess
 import sys
 import urllib.request
 from pathlib import Path
@ -319,6 +321,59 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
    return good


+# YouTube video ID: 11 characters from [A-Za-z0-9_-], embedded in the file name.
+# The lookbehind/lookahead reject runs that touch further ID characters, so a
+# longer token is never matched in part.
+_YT_ID_RE = re.compile(r"(?<![A-Za-z0-9_-])([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])")
+# yt-dlp's default output template writes the ID in square brackets
+# ("Title [ID].ext"); such a match is far more reliable than a bare token.
+_YT_BRACKETED_ID_RE = re.compile(r"\[([A-Za-z0-9_-]{11})\]")
+
+
+def _extract_youtube_id(path: Path) -> Optional[str]:
+    """Return the YouTube video ID embedded in the file name, or None.
+
+    The whole file name (stem and suffix) is searched. Bracketed candidates
+    ("[ID]") are tried first, so an ordinary 11-letter word earlier in the
+    title (e.g. "Soundtracks") cannot shadow the real ID; after that every
+    free-standing 11-character token is considered, left to right.
+
+    A candidate is only accepted if it contains at least one upper-case
+    letter and at least one digit or lower-case letter. This is a cheap
+    plausibility heuristic, not a validation against YouTube.
+    """
+    name = path.name
+    for pattern in (_YT_BRACKETED_ID_RE, _YT_ID_RE):
+        for m in pattern.finditer(name):
+            candidate = m.group(1)
+            # Simple plausibility check: the token must have mixed characters.
+            if re.search(r"[A-Z]", candidate) and re.search(r"[0-9a-z]", candidate):
+                return candidate
+    return None
+
+
+def _fetch_youtube_metadata(video_id: str) -> Optional[Dict]:
+    """Fetch YouTube metadata for *video_id* through yt-dlp (no API key needed).
+
+    Runs ``yt-dlp --dump-json`` on the video's watch URL and returns the
+    decoded JSON object. Callers in this file read the keys ``title``,
+    ``uploader``, ``channel`` and ``chapters`` from it.
+
+    Returns None when yt-dlp is not on PATH, exits with a non-zero status,
+    prints nothing, cannot be started, exceeds the 30 second timeout, or
+    prints something that is not a JSON object.
+    """
+    ytdlp = shutil.which("yt-dlp")
+    if not ytdlp:
+        return None
+    url = f"https://www.youtube.com/watch?v={video_id}"
+    try:
+        result = subprocess.run(
+            [ytdlp, "--dump-json", "--no-download", "--no-playlist", url],
+            capture_output=True, text=True, timeout=30,
+        )
+    except (OSError, subprocess.SubprocessError, ValueError) as e:
+        # OSError: binary could not be executed; SubprocessError covers the
+        # timeout; ValueError covers output that cannot be decoded as text.
+        print(f"  ⚠️ YouTube-Fehler ({video_id}): {e}", file=sys.stderr)
+        return None
+    if result.returncode != 0 or not result.stdout.strip():
+        return None
+    try:
+        meta = json.loads(result.stdout)
+    except ValueError as e:  # json.JSONDecodeError is a ValueError subclass
+        print(f"  ⚠️ YouTube-Fehler ({video_id}): {e}", file=sys.stderr)
+        return None
+    # Callers use meta.get(...); anything but a JSON object would crash them.
+    return meta if isinstance(meta, dict) else None
+
+
+def _chapters_to_tracklist_text(chapters: List[Dict]) -> str:
+    """Convert yt-dlp chapters into tracklist text, one line per chapter.
+
+    Each line has the form ``'<n>. <title>  <M>:<SS>'``, which is meant to be
+    fed to ``_parse_tracklist``. ``<n>`` is the 1-based position of the
+    chapter in *chapters*, so skipped chapters leave gaps in the numbering.
+    Minutes are not wrapped into hours (3600 seconds becomes ``60:00``).
+
+    Chapters with a missing, null or empty title, and placeholder titles
+    starting with ``<Untitled``, are skipped. A missing or null
+    ``start_time`` is treated as 0. Returns an empty string when no chapter
+    yields a line.
+    """
+    lines = []
+    for i, ch in enumerate(chapters, 1):
+        # "or" also covers an explicit null value, which .get()'s default
+        # argument would pass through as None.
+        title = (ch.get("title") or "").strip()
+        if not title or title.startswith("<Untitled"):
+            continue
+        secs = int(ch.get("start_time") or 0)
+        mm, ss = divmod(secs, 60)
+        lines.append(f"{i}. {title}  {mm}:{ss:02d}")
+    return "\n".join(lines)
+
+
 def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
    hints = AlbumHints(album_dir=scan.album_dir)

@ -342,6 +397,43 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
        if ocr_text:
            hints.tracklist_text = ocr_text

+    # YouTube-Lookup: IDs aus Dateinamen extrahieren, Metadaten per yt-dlp holen
+    yt_meta_by_id: Dict[str, Optional[Dict]] = {}
+    yt_ids_by_stem: Dict[str, str] = {}  # stem (normalisiert) → youtube_id
+
+    for audio_path in scan.audio_files:
+        yt_id = _extract_youtube_id(audio_path)
+        if yt_id:
+            stem_key = _clean(audio_path.stem).casefold()
+            yt_ids_by_stem[stem_key] = yt_id
+            yt_meta_by_id.setdefault(yt_id, None)
+
+    if yt_meta_by_id:
+        print(f"  📺 YouTube-IDs gefunden: {', '.join(list(yt_meta_by_id.keys())[:5])}", file=sys.stderr)
+        for yt_id in list(yt_meta_by_id.keys())[:5]:
+            meta = _fetch_youtube_metadata(yt_id)
+            yt_meta_by_id[yt_id] = meta
+
+        # Chapters als Tracklist nutzen wenn noch keine vorhanden
+        if not hints.tracklist_text:
+            for yt_id, meta in yt_meta_by_id.items():
+                if meta and meta.get("chapters"):
+                    chapter_text = _chapters_to_tracklist_text(meta["chapters"])
+                    if chapter_text:
+                        hints.tracklist_text = chapter_text
+                        print(f"  📺 YouTube-Chapters als Tracklist: {len(meta['chapters'])} Tracks",
+                              file=sys.stderr)
+                        break
+
+        # Album-Level-Hints (erster erfolgreicher Treffer)
+        for yt_id, meta in yt_meta_by_id.items():
+            if meta:
+                hints.yt_title = (meta.get("title") or "").strip() or None
+                hints.yt_uploader = (
+                    meta.get("uploader") or meta.get("channel") or ""
+                ).strip() or None
+                break
+
    parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []

    # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
@ -467,6 +559,17 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
            if stem_key in m3u_titles:
                title = m3u_titles[stem_key]

+        # YouTube-Titel als letzter Fallback (bei einzelner Datei = das ganze Video)
+        if not _is_good(title):
+            stem_key = _clean(audio_path.stem).casefold()
+            yt_id = yt_ids_by_stem.get(stem_key)
+            if yt_id:
+                meta = yt_meta_by_id.get(yt_id)
+                if meta:
+                    yt_video_title = (meta.get("title") or "").strip()
+                    if yt_video_title:
+                        title = yt_video_title
+
        hints.tracks.append(TrackHints(
            path=audio_path,
            track_number=track_num,