Fix YouTube ID detection: use last _-token instead of broken lookbehind regex

The previous regex used the negative lookbehind (?<![A-Za-z0-9_-]), whose character class includes _, so any candidate preceded by an underscore was rejected and IDs following an underscore were never matched. New approach: split the stem on _ and check whether the last token is an 11-character YouTube ID (characters from [A-Za-z0-9_-], containing at least one uppercase letter and at least one lowercase letter or digit). The ID token is also stripped from the stem before calling _parse_filename(), so that it cannot leak into the track title or be misread as an artist-title separator.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 05:57:27 +02:00 · 2026-04-29 05:57:27 +02:00 · 1960989eef
commit 1960989eef
parent f86db982a5
1 changed file with 24 additions and 14 deletions
--- a/hint_extractor.py
+++ b/hint_extractor.py
@@ -321,18 +321,21 @@ def _check_cover_images(paths: List[Path]) -> List[Path]:
    return good


-# YouTube-Video-ID: 11 Zeichen aus [A-Za-z0-9_-], eingebettet im Dateinamen
-_YT_ID_RE = re.compile(r"(?<![A-Za-z0-9_-])([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])")
+# YouTube-Video-ID: exakt 11 Zeichen aus [A-Za-z0-9_-], typischerweise letztes _-Token
+_YT_ID_CHARS = re.compile(r"^[A-Za-z0-9_-]{11}$")


 def _extract_youtube_id(path: Path) -> Optional[str]:
-    """Sucht eine YouTube-Video-ID im Dateinamen (Stem oder Suffix)."""
-    name = path.stem + path.suffix
-    for m in _YT_ID_RE.finditer(name):
-        candidate = m.group(1)
-        # Einfache Plausibilitätsprüfung: muss gemischte Zeichen haben
-        if re.search(r"[A-Z]", candidate) and re.search(r"[0-9a-z]", candidate):
-            return candidate
+    """
+    Erkennt YouTube-Video-ID als letztes '_'-getrenntes Token im Dateinamen.
+    Plausibilitätsprüfung: mind. ein Großbuchstabe UND mind. ein Kleinbuchstabe/Ziffer.
+    """
+    candidate = path.stem.split("_")[-1]   # letztes Token nach Unterstrich
+    if (len(candidate) == 11
+            and _YT_ID_CHARS.match(candidate)
+            and re.search(r"[A-Z]", candidate)
+            and re.search(r"[0-9a-z]", candidate)):
+        return candidate
    return None


@@ -470,8 +473,18 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:

    # Build TrackHints per audio file
    for audio_path in sorted(scan.audio_files):
+        stem_key = _clean(audio_path.stem).casefold()
+        yt_id_for_file = yt_ids_by_stem.get(stem_key)
+
+        # Stem ohne YouTube-ID für Dateiname-Parsing
+        parse_stem = audio_path.stem
+        if yt_id_for_file:
+            tokens = parse_stem.rsplit("_", 1)
+            if len(tokens) == 2 and tokens[1] == yt_id_for_file:
+                parse_stem = tokens[0]
+
        tags, duration = _read_tags(audio_path)
-        fn_hints = _parse_filename(audio_path.stem)
+        fn_hints = _parse_filename(parse_stem)

        track_num: Optional[int] = None
        disc_num: Optional[int] = None
@@ -549,20 +562,17 @@ def extract_hints(scan: AlbumScan, use_ocr: bool = True) -> AlbumHints:

        # M3U-Reihenfolge nur als letzter Fallback (wenn Tracklist kein Match liefert)
        if track_num is None:
-            stem_key = _clean(audio_path.stem).casefold()
            if stem_key in m3u_order:
                track_num = m3u_order[stem_key]

        # M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel)
        if not _is_good(title):
-            stem_key = _clean(audio_path.stem).casefold()
            if stem_key in m3u_titles:
                title = m3u_titles[stem_key]

        # YouTube-Titel als letzter Fallback (bei einzelner Datei = das ganze Video)
        if not _is_good(title):
-            stem_key = _clean(audio_path.stem).casefold()
-            yt_id = yt_ids_by_stem.get(stem_key)
+            yt_id = yt_id_for_file
            if yt_id:
                meta = yt_meta_by_id.get(yt_id)
                if meta: