From b6abfae16cca37354c90e4e29b4a6c0b5ec0a6c0 Mon Sep 17 00:00:00 2001 From: dschlueter Date: Wed, 29 Apr 2026 05:19:26 +0200 Subject: [PATCH] Add YouTube ID detection and metadata lookup via yt-dlp - Extract 11-char YouTube video IDs from audio filenames - Fetch title, uploader, chapters via yt-dlp (--dump-json) - Use chapters as tracklist when no .txt tracklist is available - Store yt_title / yt_uploader in AlbumHints for LLM prompt context - Fall back to YouTube video title as track title for single-file albums Co-Authored-By: Claude Sonnet 4.6 --- hint_extractor.py | 103 +++++++++++++++++++++++++++++++++++++++++++ metadata_resolver.py | 10 +++++ models.py | 2 + 3 files changed, 115 insertions(+) mode change 100644 => 100755 hint_extractor.py mode change 100644 => 100755 metadata_resolver.py mode change 100644 => 100755 models.py diff --git a/hint_extractor.py b/hint_extractor.py old mode 100644 new mode 100755 index 9a3117a..433c71d --- a/hint_extractor.py +++ b/hint_extractor.py @@ -4,6 +4,8 @@ import base64 import json import os import re +import shutil +import subprocess import sys import urllib.request from pathlib import Path @@ -319,6 +321,59 @@ def _check_cover_images(paths: List[Path]) -> List[Path]: return good +# YouTube-Video-ID: 11 Zeichen aus [A-Za-z0-9_-], eingebettet im Dateinamen +_YT_ID_RE = re.compile(r"(? Optional[str]: + """Sucht eine YouTube-Video-ID im Dateinamen (Stem oder Suffix).""" + name = path.stem + path.suffix + for m in _YT_ID_RE.finditer(name): + candidate = m.group(1) + # Einfache Plausibilitätsprüfung: muss gemischte Zeichen haben + if re.search(r"[A-Z]", candidate) and re.search(r"[0-9a-z]", candidate): + return candidate + return None + + +def _fetch_youtube_metadata(video_id: str) -> Optional[Dict]: + """ + Ruft YouTube-Metadaten via yt-dlp ab (kein API-Key nötig). + Gibt Dict mit title, uploader, chapters, description zurück oder None. + """ + ytdlp = shutil.which("yt-dlp") + if not ytdlp: + return None + url = f"https://www.youtube.com/watch?v={video_id}" + try: + result = subprocess.run( + [ytdlp, "--dump-json", "--no-download", "--no-playlist", url], + capture_output=True, text=True, timeout=30, + ) + if result.returncode != 0 or not result.stdout.strip(): + return None + return json.loads(result.stdout) + except Exception as e: + print(f" ⚠️ YouTube-Fehler ({video_id}): {e}", file=sys.stderr) + return None + + +def _chapters_to_tracklist_text(chapters: List[Dict]) -> str: + """ + Konvertiert yt-dlp-Chapters in Tracklist-Text der vom _parse_tracklist + verarbeitetet werden kann: '1. Titel MM:SS' + """ + lines = [] + for i, ch in enumerate(chapters, 1): + title = ch.get("title", "").strip() + if not title or title.startswith(" AlbumHints: hints = AlbumHints(album_dir=scan.album_dir) @@ -342,6 +397,43 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints: if ocr_text: hints.tracklist_text = ocr_text + # YouTube-Lookup: IDs aus Dateinamen extrahieren, Metadaten per yt-dlp holen + yt_meta_by_id: Dict[str, Optional[Dict]] = {} + yt_ids_by_stem: Dict[str, str] = {} # stem (normalisiert) → youtube_id + + for audio_path in scan.audio_files: + yt_id = _extract_youtube_id(audio_path) + if yt_id: + stem_key = _clean(audio_path.stem).casefold() + yt_ids_by_stem[stem_key] = yt_id + yt_meta_by_id.setdefault(yt_id, None) + + if yt_meta_by_id: + print(f" 📺 YouTube-IDs gefunden: {', '.join(list(yt_meta_by_id.keys())[:5])}", file=sys.stderr) + for yt_id in list(yt_meta_by_id.keys())[:5]: + meta = _fetch_youtube_metadata(yt_id) + yt_meta_by_id[yt_id] = meta + + # Chapters als Tracklist nutzen wenn noch keine vorhanden + if not hints.tracklist_text: + for yt_id, meta in yt_meta_by_id.items(): + if meta and meta.get("chapters"): + chapter_text = _chapters_to_tracklist_text(meta["chapters"]) + if chapter_text: + hints.tracklist_text = chapter_text + print(f" 📺 YouTube-Chapters als Tracklist: {len(meta['chapters'])} Tracks", + file=sys.stderr) + break + + # Album-Level-Hints (erster erfolgreicher Treffer) + for yt_id, meta in yt_meta_by_id.items(): + if meta: + hints.yt_title = (meta.get("title") or "").strip() or None + hints.yt_uploader = ( + meta.get("uploader") or meta.get("channel") or "" + ).strip() or None + break + parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else [] # M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer @@ -467,6 +559,17 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints: if stem_key in m3u_titles: title = m3u_titles[stem_key] + # YouTube-Titel als letzter Fallback (bei einzelner Datei = das ganze Video) + if not _is_good(title): + stem_key = _clean(audio_path.stem).casefold() + yt_id = yt_ids_by_stem.get(stem_key) + if yt_id: + meta = yt_meta_by_id.get(yt_id) + if meta: + yt_video_title = (meta.get("title") or "").strip() + if yt_video_title: + title = yt_video_title + hints.tracks.append(TrackHints( path=audio_path, track_number=track_num, diff --git a/metadata_resolver.py b/metadata_resolver.py old mode 100644 new mode 100755 index 804c38c..f109d04 --- a/metadata_resolver.py +++ b/metadata_resolver.py @@ -213,6 +213,8 @@ def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str: f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): " f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n" f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n" + + (f"YouTube-Videotitel: {hints.yt_title}\n" if hints.yt_title else "") + + (f"YouTube-Uploader/Kanal: {hints.yt_uploader}\n" if hints.yt_uploader else "") + (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "") + f"Tracks:\n{tracks_summary}\n\n" 'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n' @@ -354,9 +356,17 @@ def resolve( genre = genre or t.existing_tags.get("genre") label = label or t.existing_tags.get("label") or t.existing_tags.get("organization") + # YouTube-Metadaten als zusätzliche Hinweise (Uploader → Künstler, Titel → Album/Track) + if hints.yt_uploader and not artist: + artist = hints.yt_uploader + if hints.yt_title and not album: + album = hints.yt_title + if artist or album: confidence += 0.05 sources.append("local-hints") + if hints.yt_title or hints.yt_uploader: + sources.append("youtube") # AcoustID fingerprinting fp_mbids: Dict[str, List[str]] = {} diff --git a/models.py b/models.py old mode 100644 new mode 100755 index f0b7f3a..a95662f --- a/models.py +++ b/models.py @@ -50,6 +50,8 @@ class AlbumHints: tracklist_text: Optional[str] = None # merged text from all tracklist files cover_images: List[Path] = field(default_factory=list) tracks: List[TrackHints] = field(default_factory=list) + yt_title: Optional[str] = None # YouTube video title (if found) + yt_uploader: Optional[str] = None # YouTube channel/uploader name @dataclass