Add YouTube ID detection and metadata lookup via yt-dlp
- Extract 11-char YouTube video IDs from audio filenames
- Fetch title, uploader, chapters via yt-dlp (--dump-json)
- Use chapters as tracklist when no .txt tracklist is available
- Store yt_title / yt_uploader in AlbumHints for LLM prompt context
- Fall back to YouTube video title as track title for single-file albums

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
888464b4d0
commit
b6abfae16c
3 changed files with 115 additions and 0 deletions
103
hint_extractor.py
Normal file → Executable file
103
hint_extractor.py
Normal file → Executable file
|
|
@ -4,6 +4,8 @@ import base64
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -319,6 +321,59 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
|
||||||
return good
|
return good
|
||||||
|
|
||||||
|
|
||||||
|
# A YouTube video ID is 11 characters from [A-Za-z0-9_-]; a match must not be
# part of a longer run of those characters.
_YT_ID_RE = re.compile(r"(?<![A-Za-z0-9_-])([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])")

# yt-dlp's default output template writes the ID in square brackets
# ("Title [ID].ext"). A bracketed candidate is a much stronger signal than an
# arbitrary 11-character word elsewhere in the filename.
_YT_BRACKETED_ID_RE = re.compile(r"\[([A-Za-z0-9_-]{11})\]")


def _looks_like_youtube_id(candidate: str) -> bool:
    """Return True if *candidate* mixes an uppercase letter with a lowercase letter or digit."""
    return bool(re.search(r"[A-Z]", candidate) and re.search(r"[0-9a-z]", candidate))


def _extract_youtube_id(path: Path) -> Optional[str]:
    """
    Look for a YouTube video ID in the file name (stem and suffix) of *path*.

    Bracketed candidates (``[xxxxxxxxxxx]``) are tried first so that an ordinary
    11-letter word earlier in the name (e.g. ``Rachmaninov``) does not shadow the
    real ID. If none is found, any stand-alone 11-character token is considered.

    Every candidate must pass a simple plausibility check (mixed character
    classes). Returns the first plausible candidate, or None.
    """
    name = path.name
    for pattern in (_YT_BRACKETED_ID_RE, _YT_ID_RE):
        for m in pattern.finditer(name):
            candidate = m.group(1)
            if _looks_like_youtube_id(candidate):
                return candidate
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_youtube_metadata(video_id: str) -> Optional[Dict]:
    """
    Fetch YouTube metadata for *video_id* via the yt-dlp CLI (no API key needed).

    Runs ``yt-dlp --dump-json --no-download --no-playlist`` on the watch URL and
    returns the parsed JSON object. Callers in this file read keys such as
    ``title``, ``uploader``, ``channel`` and ``chapters`` from it.

    Returns None when the ID is empty, yt-dlp is not on PATH, the process fails,
    times out or prints nothing, or the output is not a JSON object.
    """
    if not video_id:
        # Nothing to look up; avoid spawning yt-dlp for an empty watch URL.
        return None
    ytdlp = shutil.which("yt-dlp")
    if not ytdlp:
        return None
    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        result = subprocess.run(
            [ytdlp, "--dump-json", "--no-download", "--no-playlist", url],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode != 0 or not result.stdout.strip():
            # Best-effort lookup: a failed or empty run simply means "no metadata".
            return None
        meta = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: yt-dlp could not be executed; SubprocessError: timeout;
        # ValueError: undecodable output or invalid JSON (JSONDecodeError).
        print(f" ⚠️ YouTube-Fehler ({video_id}): {e}", file=sys.stderr)
        return None
    # Callers use dict methods on the result, so reject any other JSON value.
    return meta if isinstance(meta, dict) else None
|
||||||
|
|
||||||
|
|
||||||
|
def _chapters_to_tracklist_text(chapters: List[Dict]) -> str:
    """
    Convert yt-dlp chapters into tracklist text, one line per chapter in the
    form '1. Title MM:SS' (intended as input for ``_parse_tracklist``).

    Chapters without a title, or whose title starts with ``<Untitled``, are
    skipped. The line number is the chapter's 1-based position in *chapters*,
    so skipped chapters leave gaps in the numbering. A missing or null
    ``start_time`` is treated as 0; fractional seconds are truncated.

    Returns the lines joined by newlines, or an empty string if nothing usable
    remains.
    """
    lines = []
    for i, ch in enumerate(chapters or [], 1):
        # The key may be present with a null value, so .get(key, default) is not enough.
        title = (ch.get("title") or "").strip()
        if not title or title.startswith("<Untitled"):
            continue
        secs = int(ch.get("start_time") or 0)
        mm, ss = divmod(secs, 60)
        lines.append(f"{i}. {title} {mm}:{ss:02d}")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
||||||
hints = AlbumHints(album_dir=scan.album_dir)
|
hints = AlbumHints(album_dir=scan.album_dir)
|
||||||
|
|
||||||
|
|
@ -342,6 +397,43 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
||||||
if ocr_text:
|
if ocr_text:
|
||||||
hints.tracklist_text = ocr_text
|
hints.tracklist_text = ocr_text
|
||||||
|
|
||||||
|
# YouTube-Lookup: IDs aus Dateinamen extrahieren, Metadaten per yt-dlp holen
|
||||||
|
yt_meta_by_id: Dict[str, Optional[Dict]] = {}
|
||||||
|
yt_ids_by_stem: Dict[str, str] = {} # stem (normalisiert) → youtube_id
|
||||||
|
|
||||||
|
for audio_path in scan.audio_files:
|
||||||
|
yt_id = _extract_youtube_id(audio_path)
|
||||||
|
if yt_id:
|
||||||
|
stem_key = _clean(audio_path.stem).casefold()
|
||||||
|
yt_ids_by_stem[stem_key] = yt_id
|
||||||
|
yt_meta_by_id.setdefault(yt_id, None)
|
||||||
|
|
||||||
|
if yt_meta_by_id:
|
||||||
|
print(f" 📺 YouTube-IDs gefunden: {', '.join(list(yt_meta_by_id.keys())[:5])}", file=sys.stderr)
|
||||||
|
for yt_id in list(yt_meta_by_id.keys())[:5]:
|
||||||
|
meta = _fetch_youtube_metadata(yt_id)
|
||||||
|
yt_meta_by_id[yt_id] = meta
|
||||||
|
|
||||||
|
# Chapters als Tracklist nutzen wenn noch keine vorhanden
|
||||||
|
if not hints.tracklist_text:
|
||||||
|
for yt_id, meta in yt_meta_by_id.items():
|
||||||
|
if meta and meta.get("chapters"):
|
||||||
|
chapter_text = _chapters_to_tracklist_text(meta["chapters"])
|
||||||
|
if chapter_text:
|
||||||
|
hints.tracklist_text = chapter_text
|
||||||
|
print(f" 📺 YouTube-Chapters als Tracklist: {len(meta['chapters'])} Tracks",
|
||||||
|
file=sys.stderr)
|
||||||
|
break
|
||||||
|
|
||||||
|
# Album-Level-Hints (erster erfolgreicher Treffer)
|
||||||
|
for yt_id, meta in yt_meta_by_id.items():
|
||||||
|
if meta:
|
||||||
|
hints.yt_title = (meta.get("title") or "").strip() or None
|
||||||
|
hints.yt_uploader = (
|
||||||
|
meta.get("uploader") or meta.get("channel") or ""
|
||||||
|
).strip() or None
|
||||||
|
break
|
||||||
|
|
||||||
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
|
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
|
||||||
|
|
||||||
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
|
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
|
||||||
|
|
@ -467,6 +559,17 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:
|
||||||
if stem_key in m3u_titles:
|
if stem_key in m3u_titles:
|
||||||
title = m3u_titles[stem_key]
|
title = m3u_titles[stem_key]
|
||||||
|
|
||||||
|
# YouTube-Titel als letzter Fallback (bei einzelner Datei = das ganze Video)
|
||||||
|
if not _is_good(title):
|
||||||
|
stem_key = _clean(audio_path.stem).casefold()
|
||||||
|
yt_id = yt_ids_by_stem.get(stem_key)
|
||||||
|
if yt_id:
|
||||||
|
meta = yt_meta_by_id.get(yt_id)
|
||||||
|
if meta:
|
||||||
|
yt_video_title = (meta.get("title") or "").strip()
|
||||||
|
if yt_video_title:
|
||||||
|
title = yt_video_title
|
||||||
|
|
||||||
hints.tracks.append(TrackHints(
|
hints.tracks.append(TrackHints(
|
||||||
path=audio_path,
|
path=audio_path,
|
||||||
track_number=track_num,
|
track_number=track_num,
|
||||||
|
|
|
||||||
10
metadata_resolver.py
Normal file → Executable file
10
metadata_resolver.py
Normal file → Executable file
|
|
@ -213,6 +213,8 @@ def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str:
|
||||||
f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): "
|
f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): "
|
||||||
f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n"
|
f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n"
|
||||||
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
|
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
|
||||||
|
+ (f"YouTube-Videotitel: {hints.yt_title}\n" if hints.yt_title else "")
|
||||||
|
+ (f"YouTube-Uploader/Kanal: {hints.yt_uploader}\n" if hints.yt_uploader else "")
|
||||||
+ (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "")
|
+ (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "")
|
||||||
+ f"Tracks:\n{tracks_summary}\n\n"
|
+ f"Tracks:\n{tracks_summary}\n\n"
|
||||||
'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n'
|
'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n'
|
||||||
|
|
@ -354,9 +356,17 @@ def resolve(
|
||||||
genre = genre or t.existing_tags.get("genre")
|
genre = genre or t.existing_tags.get("genre")
|
||||||
label = label or t.existing_tags.get("label") or t.existing_tags.get("organization")
|
label = label or t.existing_tags.get("label") or t.existing_tags.get("organization")
|
||||||
|
|
||||||
|
# YouTube-Metadaten als zusätzliche Hinweise (Uploader → Künstler, Titel → Album/Track)
|
||||||
|
if hints.yt_uploader and not artist:
|
||||||
|
artist = hints.yt_uploader
|
||||||
|
if hints.yt_title and not album:
|
||||||
|
album = hints.yt_title
|
||||||
|
|
||||||
if artist or album:
|
if artist or album:
|
||||||
confidence += 0.05
|
confidence += 0.05
|
||||||
sources.append("local-hints")
|
sources.append("local-hints")
|
||||||
|
if hints.yt_title or hints.yt_uploader:
|
||||||
|
sources.append("youtube")
|
||||||
|
|
||||||
# AcoustID fingerprinting
|
# AcoustID fingerprinting
|
||||||
fp_mbids: Dict[str, List[str]] = {}
|
fp_mbids: Dict[str, List[str]] = {}
|
||||||
|
|
|
||||||
2
models.py
Normal file → Executable file
2
models.py
Normal file → Executable file
|
|
@ -50,6 +50,8 @@ class AlbumHints:
|
||||||
tracklist_text: Optional[str] = None # merged text from all tracklist files
|
tracklist_text: Optional[str] = None # merged text from all tracklist files
|
||||||
cover_images: List[Path] = field(default_factory=list)
|
cover_images: List[Path] = field(default_factory=list)
|
||||||
tracks: List[TrackHints] = field(default_factory=list)
|
tracks: List[TrackHints] = field(default_factory=list)
|
||||||
|
yt_title: Optional[str] = None # YouTube video title (if found)
|
||||||
|
yt_uploader: Optional[str] = None # YouTube channel/uploader name
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue