Robust tracklist matching: fuzzy titles, catalog numbers, correct disc/track

hint_extractor:
- _norm_for_match(): casefolds, then strips every character outside a–z/0–9 (punctuation, whitespace, and also non-ASCII letters) for punctuation-agnostic comparison
- _catalog_key(): extracts BWV/Op./K./HWV/... catalog number for matching
  (fixes abbreviated filenames like "Fantasia_Cm_BWV_562" vs "Fantasia In C Minor, BWV 562")
- Matching priority: exact number+disc → exact title → fuzzy title → catalog number
- Tracklist disc+track OVERRIDE M3U position when a match is found
  (M3U is only used as last fallback; fixes wrong alphabetical ordering)

metadata_resolver:
- LLM prompt now defines artist/albumartist roles explicitly
  (artist = composer for classical; albumartist = performer/interpreter)
- LLM albumartist can override dir_artist when confidence < 0.4
- _build_track_proposals: when track artist == albumartist (performer from filename),
  composer (album-level artist) is used as track artist instead
- Tracklist header (first lines before tracks) included in LLM prompt
  for label/year/album-title discovery
- import re added (was missing)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-04-29 02:32:11 +02:00
commit d1391fc36a
2 changed files with 134 additions and 30 deletions

View file

@ -56,6 +56,26 @@ def _clean(s: Optional[str]) -> str:
return re.sub(r"\s+", " ", s.replace("_", " ")).strip(" -._") return re.sub(r"\s+", " ", s.replace("_", " ")).strip(" -._")
def _norm_for_match(s: str) -> str:
"""Nur Buchstaben und Ziffern — für fuzzy Titelvergleich (Interpunktion-agnostisch)."""
return re.sub(r"[^a-z0-9]", "", s.casefold())
# Klassische Werkverzeichnis-Nummern: BWV 565, Op. 27, K. 331, HWV 56, …
_CATALOG_RE = re.compile(
r"\b(bwv|hwv|op|k|kv|d|sz|wq|bbwv|rv|twv|hob)\W*(\d+[a-z]?(?:[\/\.]\d+)?)",
re.IGNORECASE,
)
def _catalog_key(s: str) -> Optional[str]:
"""Extrahiert normalisierte Katalognummer, z.B. 'bwv565' oder 'op27'."""
m = _CATALOG_RE.search(s)
if m:
return m.group(1).lower() + re.sub(r"\W", "", m.group(2))
return None
def _is_good(v: Optional[str]) -> bool: def _is_good(v: Optional[str]) -> bool:
if not v: if not v:
return False return False
@ -249,12 +269,21 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
except Exception as e: except Exception as e:
print(f" ⚠️ Playlist-Lesefehler {pf.name}: {e}", file=sys.stderr) print(f" ⚠️ Playlist-Lesefehler {pf.name}: {e}", file=sys.stderr)
# Tracklist-Lookup: normalisierter Titel → Eintrag (für titelbasiertes Matching) # Tracklist-Lookup: exakter Titel, fuzzy Titel, Katalognummer (BWV, Op., K., …)
tl_by_title: Dict[str, Dict[str, str]] = {} tl_by_title: Dict[str, Dict[str, str]] = {}
tl_by_title_norm: Dict[str, Dict[str, str]] = {}
tl_by_catalog: Dict[str, Dict[str, str]] = {}
for entry in parsed_tracklist: for entry in parsed_tracklist:
key = _clean(entry.get("title", "")).casefold() raw_title = entry.get("title", "")
if key: exact_key = _clean(raw_title).casefold()
tl_by_title[key] = entry if exact_key:
tl_by_title[exact_key] = entry
norm_key = _norm_for_match(raw_title)
if norm_key:
tl_by_title_norm[norm_key] = entry
cat_key = _catalog_key(raw_title)
if cat_key:
tl_by_catalog[cat_key] = entry
# Build TrackHints per audio file # Build TrackHints per audio file
for audio_path in sorted(scan.audio_files): for audio_path in sorted(scan.audio_files):
@ -274,12 +303,6 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
except ValueError: except ValueError:
pass pass
# Track number aus M3U-Reihenfolge (Vorrang vor Dateiname, aber nicht vor Tag)
if track_num is None:
stem_key = _clean(audio_path.stem).casefold()
if stem_key in m3u_order:
track_num = m3u_order[stem_key]
# Disc number: tag > filename > path segment # Disc number: tag > filename > path segment
raw_dn = tags.get("discnumber") or fn_hints.get("disc") raw_dn = tags.get("discnumber") or fn_hints.get("disc")
if raw_dn: if raw_dn:
@ -297,21 +320,55 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
title = tags.get("title") or fn_hints.get("title") title = tags.get("title") or fn_hints.get("title")
artist = tags.get("artist") or fn_hints.get("artist") artist = tags.get("artist") or fn_hints.get("artist")
# Tracklist: erst nach Nummer, dann nach Titel # Tracklist-Matching: Nummer → exakter Titel → fuzzy Titel
# Wenn ein Match gefunden: disc+track aus Tracklist übernehmen (Tracklist ist
# autoritativer als M3U-Reihenfolge bei Alben mit expliziter Disc-Nummerierung).
if parsed_tracklist: if parsed_tracklist:
matched_tl: Optional[Dict[str, str]] = None matched_tl: Optional[Dict[str, str]] = None
if track_num:
# 1. Exakt per Tracknummer + Disc (nur wenn beides aus Tag/Dateiname bekannt)
if track_num and disc_num:
for tl_entry in parsed_tracklist: for tl_entry in parsed_tracklist:
tl_track = tl_entry.get("track") tl_track = tl_entry.get("track")
tl_disc = tl_entry.get("disc", "1") tl_disc = tl_entry.get("disc", "1")
if (tl_track and int(tl_track) == track_num if (tl_track and int(tl_track) == track_num
and int(tl_disc) == (disc_num or 1)): and int(tl_disc) == disc_num):
matched_tl = tl_entry matched_tl = tl_entry
break break
# 2. Exakter Titelvergleich
if matched_tl is None and title: if matched_tl is None and title:
matched_tl = tl_by_title.get(_clean(title).casefold()) matched_tl = tl_by_title.get(_clean(title).casefold())
if matched_tl and not _is_good(title) and _is_good(matched_tl.get("title")):
title = matched_tl["title"] # 3. Fuzzy Titelvergleich (ignoriert Kommas, Apostrophe, Groß-/Kleinschreibung)
if matched_tl is None and title:
matched_tl = tl_by_title_norm.get(_norm_for_match(title))
# 4. Katalognummer (BWV, Op., K. …) — greift bei abgekürzten Dateinamen
if matched_tl is None and title:
cat = _catalog_key(title)
if cat:
matched_tl = tl_by_catalog.get(cat)
if matched_tl:
# Titel aus Tracklist übernehmen wenn besser
if _is_good(matched_tl.get("title")):
title = matched_tl["title"]
# disc+track aus Tracklist sind autoritativer als M3U-Reihenfolge
try:
tl_track_n = int(matched_tl["track"]) if matched_tl.get("track") else None
tl_disc_n = int(matched_tl.get("disc", "1"))
if tl_track_n:
track_num = tl_track_n
disc_num = tl_disc_n
except (ValueError, KeyError):
pass
# M3U-Reihenfolge nur als letzter Fallback (wenn Tracklist kein Match liefert)
if track_num is None:
stem_key = _clean(audio_path.stem).casefold()
if stem_key in m3u_order:
track_num = m3u_order[stem_key]
# M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel) # M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel)
if not _is_good(title): if not _is_good(title):

View file

@ -1,6 +1,7 @@
from __future__ import annotations from __future__ import annotations
import os import os
import re
import sys import sys
import time import time
from typing import Optional, List, Dict, Tuple from typing import Optional, List, Dict, Tuple
@ -179,20 +180,42 @@ def _discogs_search(artist: Optional[str], album: Optional[str]) -> Optional[Dic
def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str: def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str:
tracks_summary = "\n".join( tracks_summary = "\n".join(
f" - Track {t.track_number or '?'}: {t.title or t.path.stem}" f" - {('D'+str(t.disc_number)+'-') if t.disc_number else ''}T{t.track_number or '?'}: "
f"{t.title or t.path.stem}"
+ (f" [{t.artist}]" if t.artist else "") + (f" [{t.artist}]" if t.artist else "")
for t in hints.tracks[:20] for t in hints.tracks[:20]
) )
# Tracklist-Kopfzeilen (erste 400 Zeichen, vor der Track-Liste) für Album/Label-Info
tracklist_header = ""
if hints.tracklist_text:
header_lines = []
for line in hints.tracklist_text.splitlines():
line = line.strip()
if not line:
continue
# Stopp bei erster Zeile die wie ein Track aussieht (1-1, 1. etc.)
if re.match(r"^\d[\d\-]\s+\S", line) or re.match(r"^\d{1,3}[.)]\s+", line):
break
header_lines.append(line)
if sum(len(l) for l in header_lines) > 400:
break
tracklist_header = "\n".join(header_lines[:15])
return ( return (
"Du bist ein Musikexperte. Analysiere diese Album-Daten.\n" "Du bist ein Musikexperte. Analysiere diese Album-Daten und gib korrekte Metadaten zurück.\n"
"Vervollständige fehlende Felder UND korrigiere erkennbare Tippfehler " "Korrigiere auch erkennbare Tippfehler (Verzeichnisnamen enthalten oft Schreibfehler).\n\n"
"(z.B. im Albumtitel oder Künstlernamen — Verzeichnisnamen enthalten oft Schreibfehler).\n\n" "WICHTIGE FELDDEFINITIONEN:\n"
'- "artist" = Komponist (Klassik) ODER Band/Sänger (Pop/Rock/Jazz)\n'
'- "albumartist" = Interpret/Performer/Dirigent (Klassik) ODER gleich wie artist (Pop)\n'
" Beispiel Klassik: artist='Johann Sebastian Bach', albumartist='Peter Hurford'\n"
" Beispiel Pop: artist='ABBA', albumartist='ABBA'\n\n"
f"Verzeichnisname: {hints.album_dir.name}\n" f"Verzeichnisname: {hints.album_dir.name}\n"
f"Künstler (aus Verzeichnis): {hints.dir_artist or partial.get('artist', 'unbekannt')}\n" f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): "
f"Albumtitel (aus Verzeichnis, evtl. mit Tippfehlern): {hints.dir_album or partial.get('album', 'unbekannt')}\n" f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n"
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n" f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
f"Tracklist-Hinweise:\n{tracks_summary}\n\n" + (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "")
'Antworte NUR mit einem JSON-Objekt mit diesen Feldern (null wenn unbekannt):\n' + f"Tracks:\n{tracks_summary}\n\n"
'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n'
'{"artist": ..., "album": ..., "albumartist": ..., "year": ..., "genre": ..., "label": ...}' '{"artist": ..., "album": ..., "albumartist": ..., "year": ..., "genre": ..., "label": ...}'
) )
@ -408,6 +431,7 @@ def resolve(
# LLM-Reasoning für verbleibende Lücken: # LLM-Reasoning für verbleibende Lücken:
# Reihenfolge: Ollama lokal → OpenRouter (DeepSeek, günstig) → Claude API # Reihenfolge: Ollama lokal → OpenRouter (DeepSeek, günstig) → Claude API
cl_albumartist: Optional[str] = None
partial = {"artist": artist, "album": album, "year": year} partial = {"artist": artist, "album": album, "year": year}
if use_claude and use_api: if use_claude and use_api:
if not artist or not album or confidence < 0.5: if not artist or not album or confidence < 0.5:
@ -427,18 +451,28 @@ def resolve(
year = year or cl.get("year") year = year or cl.get("year")
genre = genre or cl.get("genre") genre = genre or cl.get("genre")
label = label or cl.get("label") label = label or cl.get("label")
cl_albumartist = cl.get("albumartist") or None
confidence += 0.10 confidence += 0.10
sources.append("llm-resolve") sources.append("llm-resolve")
# Finalize albumartist # Finalize albumartist
# dir_artist hat Vorrang: wenn der Verzeichnisname einen Künstler nennt # Priorität: (1) LLM-albumartist bei niedriger Konfidenz
# (z.B. "Eugen_Cicero_-_Jazz_meets_Classic"), ist das der Albumkünstler — # (2) dir_artist wenn Verzeichnisname einen Künstler nennt
# auch wenn die Track-Dateinamen die Komponisten-Namen enthalten. # (3) Heuristiken (Various Artists, Mehrheitsabstimmung)
# Rationale: "Bach_Organ_-_Peter_Hurford" → dir_artist="Bach Organ" ist kein Künstler,
# aber der Verzeichnisname sieht aus wie Künstler; LLM kann das korrekt auflösen.
track_artists = [t.artist for t in hints.tracks if t.artist] track_artists = [t.artist for t in hints.tracks if t.artist]
from collections import Counter from collections import Counter
distinct_artists = set(a for a in track_artists if a) distinct_artists = set(a for a in track_artists if a)
if hints.dir_artist:
# Verzeichnisname nennt explizit einen Künstler → immer verwenden _bad_aa = {"various artists", "unknown artist", "unknown", "va"}
def _good_aa(s: Optional[str]) -> bool:
return bool(s) and s.casefold().strip() not in _bad_aa
if _good_aa(cl_albumartist) and confidence < 0.4:
# LLM kennt den echten Albumkünstler besser als der Verzeichnisname
albumartist = cl_albumartist # type: ignore[assignment]
elif hints.dir_artist:
albumartist = hints.dir_artist albumartist = hints.dir_artist
elif len(distinct_artists) >= 3: elif len(distinct_artists) >= 3:
albumartist = "Various Artists" albumartist = "Various Artists"
@ -452,7 +486,9 @@ def resolve(
confidence = min(confidence, 1.0) confidence = min(confidence, 1.0)
# Build track proposals # Build track proposals
track_proposals = _build_track_proposals(hints, mb_tracks, album, artist) # `artist` = Komponist/Hauptkünstler (LLM-aufgelöst), `albumartist` = Performer
# Werden beide weitergegeben damit _build_track_proposals richtig zuordnen kann.
track_proposals = _build_track_proposals(hints, mb_tracks, album, albumartist, composer=artist)
return AlbumProposal( return AlbumProposal(
album_dir=hints.album_dir, album_dir=hints.album_dir,
@ -476,15 +512,26 @@ def _build_track_proposals(
mb_tracks: Optional[List], mb_tracks: Optional[List],
album: str, album: str,
album_artist: str, album_artist: str,
composer: Optional[str] = None,
) -> List[TrackProposal]: ) -> List[TrackProposal]:
proposals: List[TrackProposal] = [] proposals: List[TrackProposal] = []
for th in sorted(hints.tracks, key=lambda t: (t.disc_number or 1, t.track_number or 9999, str(t.path))): for th in sorted(hints.tracks, key=lambda t: (t.disc_number or 1, t.track_number or 9999, str(t.path))):
title = th.title title = th.title
artist = th.artist or album_artist
track_num = th.track_number track_num = th.track_number
disc_num = th.disc_number disc_num = th.disc_number
# Klassik-Fall: Performer aus Dateiname, Komponist aus LLM
# Wenn th.artist == albumartist (Performer), und wir den Komponisten kennen,
# wird der Komponist als Track-Artist gesetzt → Filename: TT_-_Performer_-_Komponist_-_Werk
th_artist_cf = (th.artist or "").casefold().strip()
aa_cf = album_artist.casefold().strip()
if composer and th_artist_cf == aa_cf and th_artist_cf:
# Performer == albumartist → Komponist als Track-Artist
artist = composer
else:
artist = th.artist or album_artist
# Try to match from MusicBrainz track list # Try to match from MusicBrainz track list
if mb_tracks and track_num: if mb_tracks and track_num:
for mb_t in mb_tracks: for mb_t in mb_tracks: