Robust tracklist matching: fuzzy titles, catalog numbers, correct disc/track
hint_extractor: - _norm_for_match(): strips all non-alnum for punctuation-agnostic comparison - _catalog_key(): extracts BWV/Op./K./HWV/... catalog number for matching (fixes abbreviated filenames like "Fantasia_Cm_BWV_562" vs "Fantasia In C Minor, BWV 562") - Matching priority: exact number+disc → exact title → fuzzy title → catalog number - Tracklist disc+track OVERRIDE M3U position when a match is found (M3U is only used as last fallback; fixes wrong alphabetical ordering) metadata_resolver: - LLM prompt now defines artist/albumartist roles explicitly (artist = composer for classical; albumartist = performer/interpreter) - LLM albumartist can override dir_artist when confidence < 0.4 - _build_track_proposals: when track artist == albumartist (performer from filename), composer (album-level artist) is used as track artist instead - Tracklist header (first lines before tracks) included in LLM prompt for label/year/album-title discovery - import re added (was missing) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5011cef4db
commit
d1391fc36a
2 changed files with 134 additions and 30 deletions
|
|
@ -56,6 +56,26 @@ def _clean(s: Optional[str]) -> str:
|
||||||
return re.sub(r"\s+", " ", s.replace("_", " ")).strip(" -._")
|
return re.sub(r"\s+", " ", s.replace("_", " ")).strip(" -._")
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_for_match(s: str) -> str:
|
||||||
|
"""Nur Buchstaben und Ziffern — für fuzzy Titelvergleich (Interpunktion-agnostisch)."""
|
||||||
|
return re.sub(r"[^a-z0-9]", "", s.casefold())
|
||||||
|
|
||||||
|
|
||||||
|
# Klassische Werkverzeichnis-Nummern: BWV 565, Op. 27, K. 331, HWV 56, …
|
||||||
|
_CATALOG_RE = re.compile(
|
||||||
|
r"\b(bwv|hwv|op|k|kv|d|sz|wq|bbwv|rv|twv|hob)\W*(\d+[a-z]?(?:[\/\.]\d+)?)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _catalog_key(s: str) -> Optional[str]:
|
||||||
|
"""Extrahiert normalisierte Katalognummer, z.B. 'bwv565' oder 'op27'."""
|
||||||
|
m = _CATALOG_RE.search(s)
|
||||||
|
if m:
|
||||||
|
return m.group(1).lower() + re.sub(r"\W", "", m.group(2))
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _is_good(v: Optional[str]) -> bool:
|
def _is_good(v: Optional[str]) -> bool:
|
||||||
if not v:
|
if not v:
|
||||||
return False
|
return False
|
||||||
|
|
@ -249,12 +269,21 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" ⚠️ Playlist-Lesefehler {pf.name}: {e}", file=sys.stderr)
|
print(f" ⚠️ Playlist-Lesefehler {pf.name}: {e}", file=sys.stderr)
|
||||||
|
|
||||||
# Tracklist-Lookup: normalisierter Titel → Eintrag (für titelbasiertes Matching)
|
# Tracklist-Lookup: exakter Titel, fuzzy Titel, Katalognummer (BWV, Op., K., …)
|
||||||
tl_by_title: Dict[str, Dict[str, str]] = {}
|
tl_by_title: Dict[str, Dict[str, str]] = {}
|
||||||
|
tl_by_title_norm: Dict[str, Dict[str, str]] = {}
|
||||||
|
tl_by_catalog: Dict[str, Dict[str, str]] = {}
|
||||||
for entry in parsed_tracklist:
|
for entry in parsed_tracklist:
|
||||||
key = _clean(entry.get("title", "")).casefold()
|
raw_title = entry.get("title", "")
|
||||||
if key:
|
exact_key = _clean(raw_title).casefold()
|
||||||
tl_by_title[key] = entry
|
if exact_key:
|
||||||
|
tl_by_title[exact_key] = entry
|
||||||
|
norm_key = _norm_for_match(raw_title)
|
||||||
|
if norm_key:
|
||||||
|
tl_by_title_norm[norm_key] = entry
|
||||||
|
cat_key = _catalog_key(raw_title)
|
||||||
|
if cat_key:
|
||||||
|
tl_by_catalog[cat_key] = entry
|
||||||
|
|
||||||
# Build TrackHints per audio file
|
# Build TrackHints per audio file
|
||||||
for audio_path in sorted(scan.audio_files):
|
for audio_path in sorted(scan.audio_files):
|
||||||
|
|
@ -274,12 +303,6 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Track number aus M3U-Reihenfolge (Vorrang vor Dateiname, aber nicht vor Tag)
|
|
||||||
if track_num is None:
|
|
||||||
stem_key = _clean(audio_path.stem).casefold()
|
|
||||||
if stem_key in m3u_order:
|
|
||||||
track_num = m3u_order[stem_key]
|
|
||||||
|
|
||||||
# Disc number: tag > filename > path segment
|
# Disc number: tag > filename > path segment
|
||||||
raw_dn = tags.get("discnumber") or fn_hints.get("disc")
|
raw_dn = tags.get("discnumber") or fn_hints.get("disc")
|
||||||
if raw_dn:
|
if raw_dn:
|
||||||
|
|
@ -297,21 +320,55 @@ def extract_hints(scan: AlbumScan) -> AlbumHints:
|
||||||
title = tags.get("title") or fn_hints.get("title")
|
title = tags.get("title") or fn_hints.get("title")
|
||||||
artist = tags.get("artist") or fn_hints.get("artist")
|
artist = tags.get("artist") or fn_hints.get("artist")
|
||||||
|
|
||||||
# Tracklist: erst nach Nummer, dann nach Titel
|
# Tracklist-Matching: Nummer → exakter Titel → fuzzy Titel
|
||||||
|
# Wenn ein Match gefunden: disc+track aus Tracklist übernehmen (Tracklist ist
|
||||||
|
# autoritativer als M3U-Reihenfolge bei Alben mit expliziter Disc-Nummerierung).
|
||||||
if parsed_tracklist:
|
if parsed_tracklist:
|
||||||
matched_tl: Optional[Dict[str, str]] = None
|
matched_tl: Optional[Dict[str, str]] = None
|
||||||
if track_num:
|
|
||||||
|
# 1. Exakt per Tracknummer + Disc (nur wenn beides aus Tag/Dateiname bekannt)
|
||||||
|
if track_num and disc_num:
|
||||||
for tl_entry in parsed_tracklist:
|
for tl_entry in parsed_tracklist:
|
||||||
tl_track = tl_entry.get("track")
|
tl_track = tl_entry.get("track")
|
||||||
tl_disc = tl_entry.get("disc", "1")
|
tl_disc = tl_entry.get("disc", "1")
|
||||||
if (tl_track and int(tl_track) == track_num
|
if (tl_track and int(tl_track) == track_num
|
||||||
and int(tl_disc) == (disc_num or 1)):
|
and int(tl_disc) == disc_num):
|
||||||
matched_tl = tl_entry
|
matched_tl = tl_entry
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# 2. Exakter Titelvergleich
|
||||||
if matched_tl is None and title:
|
if matched_tl is None and title:
|
||||||
matched_tl = tl_by_title.get(_clean(title).casefold())
|
matched_tl = tl_by_title.get(_clean(title).casefold())
|
||||||
if matched_tl and not _is_good(title) and _is_good(matched_tl.get("title")):
|
|
||||||
title = matched_tl["title"]
|
# 3. Fuzzy Titelvergleich (ignoriert Kommas, Apostrophe, Groß-/Kleinschreibung)
|
||||||
|
if matched_tl is None and title:
|
||||||
|
matched_tl = tl_by_title_norm.get(_norm_for_match(title))
|
||||||
|
|
||||||
|
# 4. Katalognummer (BWV, Op., K. …) — greift bei abgekürzten Dateinamen
|
||||||
|
if matched_tl is None and title:
|
||||||
|
cat = _catalog_key(title)
|
||||||
|
if cat:
|
||||||
|
matched_tl = tl_by_catalog.get(cat)
|
||||||
|
|
||||||
|
if matched_tl:
|
||||||
|
# Titel aus Tracklist übernehmen wenn besser
|
||||||
|
if _is_good(matched_tl.get("title")):
|
||||||
|
title = matched_tl["title"]
|
||||||
|
# disc+track aus Tracklist sind autoritativer als M3U-Reihenfolge
|
||||||
|
try:
|
||||||
|
tl_track_n = int(matched_tl["track"]) if matched_tl.get("track") else None
|
||||||
|
tl_disc_n = int(matched_tl.get("disc", "1"))
|
||||||
|
if tl_track_n:
|
||||||
|
track_num = tl_track_n
|
||||||
|
disc_num = tl_disc_n
|
||||||
|
except (ValueError, KeyError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# M3U-Reihenfolge nur als letzter Fallback (wenn Tracklist kein Match liefert)
|
||||||
|
if track_num is None:
|
||||||
|
stem_key = _clean(audio_path.stem).casefold()
|
||||||
|
if stem_key in m3u_order:
|
||||||
|
track_num = m3u_order[stem_key]
|
||||||
|
|
||||||
# M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel)
|
# M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel)
|
||||||
if not _is_good(title):
|
if not _is_good(title):
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from typing import Optional, List, Dict, Tuple
|
from typing import Optional, List, Dict, Tuple
|
||||||
|
|
@ -179,20 +180,42 @@ def _discogs_search(artist: Optional[str], album: Optional[str]) -> Optional[Dic
|
||||||
|
|
||||||
def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str:
|
def _build_resolve_prompt(hints: AlbumHints, partial: Dict) -> str:
|
||||||
tracks_summary = "\n".join(
|
tracks_summary = "\n".join(
|
||||||
f" - Track {t.track_number or '?'}: {t.title or t.path.stem}"
|
f" - {('D'+str(t.disc_number)+'-') if t.disc_number else ''}T{t.track_number or '?'}: "
|
||||||
|
f"{t.title or t.path.stem}"
|
||||||
+ (f" [{t.artist}]" if t.artist else "")
|
+ (f" [{t.artist}]" if t.artist else "")
|
||||||
for t in hints.tracks[:20]
|
for t in hints.tracks[:20]
|
||||||
)
|
)
|
||||||
|
# Tracklist-Kopfzeilen (erste 400 Zeichen, vor der Track-Liste) für Album/Label-Info
|
||||||
|
tracklist_header = ""
|
||||||
|
if hints.tracklist_text:
|
||||||
|
header_lines = []
|
||||||
|
for line in hints.tracklist_text.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
# Stopp bei erster Zeile die wie ein Track aussieht (1-1, 1. etc.)
|
||||||
|
if re.match(r"^\d[\d\-]\s+\S", line) or re.match(r"^\d{1,3}[.)]\s+", line):
|
||||||
|
break
|
||||||
|
header_lines.append(line)
|
||||||
|
if sum(len(l) for l in header_lines) > 400:
|
||||||
|
break
|
||||||
|
tracklist_header = "\n".join(header_lines[:15])
|
||||||
|
|
||||||
return (
|
return (
|
||||||
"Du bist ein Musikexperte. Analysiere diese Album-Daten.\n"
|
"Du bist ein Musikexperte. Analysiere diese Album-Daten und gib korrekte Metadaten zurück.\n"
|
||||||
"Vervollständige fehlende Felder UND korrigiere erkennbare Tippfehler "
|
"Korrigiere auch erkennbare Tippfehler (Verzeichnisnamen enthalten oft Schreibfehler).\n\n"
|
||||||
"(z.B. im Albumtitel oder Künstlernamen — Verzeichnisnamen enthalten oft Schreibfehler).\n\n"
|
"WICHTIGE FELDDEFINITIONEN:\n"
|
||||||
|
'- "artist" = Komponist (Klassik) ODER Band/Sänger (Pop/Rock/Jazz)\n'
|
||||||
|
'- "albumartist" = Interpret/Performer/Dirigent (Klassik) ODER gleich wie artist (Pop)\n'
|
||||||
|
" Beispiel Klassik: artist='Johann Sebastian Bach', albumartist='Peter Hurford'\n"
|
||||||
|
" Beispiel Pop: artist='ABBA', albumartist='ABBA'\n\n"
|
||||||
f"Verzeichnisname: {hints.album_dir.name}\n"
|
f"Verzeichnisname: {hints.album_dir.name}\n"
|
||||||
f"Künstler (aus Verzeichnis): {hints.dir_artist or partial.get('artist', 'unbekannt')}\n"
|
f"Hinweis Künstler/Titel (aus Verzeichnis, kann vertauscht oder falsch sein): "
|
||||||
f"Albumtitel (aus Verzeichnis, evtl. mit Tippfehlern): {hints.dir_album or partial.get('album', 'unbekannt')}\n"
|
f"{hints.dir_artist or '?'} / {hints.dir_album or partial.get('album', '?')}\n"
|
||||||
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
|
f"Jahr: {hints.dir_year or partial.get('year', 'unbekannt')}\n"
|
||||||
f"Tracklist-Hinweise:\n{tracks_summary}\n\n"
|
+ (f"Tracklist-Kopf (Label/Jahr/Albumtitel):\n{tracklist_header}\n\n" if tracklist_header else "")
|
||||||
'Antworte NUR mit einem JSON-Objekt mit diesen Feldern (null wenn unbekannt):\n'
|
+ f"Tracks:\n{tracks_summary}\n\n"
|
||||||
|
'Antworte NUR mit einem JSON-Objekt (null wenn unbekannt):\n'
|
||||||
'{"artist": ..., "album": ..., "albumartist": ..., "year": ..., "genre": ..., "label": ...}'
|
'{"artist": ..., "album": ..., "albumartist": ..., "year": ..., "genre": ..., "label": ...}'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -408,6 +431,7 @@ def resolve(
|
||||||
|
|
||||||
# LLM-Reasoning für verbleibende Lücken:
|
# LLM-Reasoning für verbleibende Lücken:
|
||||||
# Reihenfolge: Ollama lokal → OpenRouter (DeepSeek, günstig) → Claude API
|
# Reihenfolge: Ollama lokal → OpenRouter (DeepSeek, günstig) → Claude API
|
||||||
|
cl_albumartist: Optional[str] = None
|
||||||
partial = {"artist": artist, "album": album, "year": year}
|
partial = {"artist": artist, "album": album, "year": year}
|
||||||
if use_claude and use_api:
|
if use_claude and use_api:
|
||||||
if not artist or not album or confidence < 0.5:
|
if not artist or not album or confidence < 0.5:
|
||||||
|
|
@ -427,18 +451,28 @@ def resolve(
|
||||||
year = year or cl.get("year")
|
year = year or cl.get("year")
|
||||||
genre = genre or cl.get("genre")
|
genre = genre or cl.get("genre")
|
||||||
label = label or cl.get("label")
|
label = label or cl.get("label")
|
||||||
|
cl_albumartist = cl.get("albumartist") or None
|
||||||
confidence += 0.10
|
confidence += 0.10
|
||||||
sources.append("llm-resolve")
|
sources.append("llm-resolve")
|
||||||
|
|
||||||
# Finalize albumartist
|
# Finalize albumartist
|
||||||
# dir_artist hat Vorrang: wenn der Verzeichnisname einen Künstler nennt
|
# Priorität: (1) LLM-albumartist bei niedriger Konfidenz
|
||||||
# (z.B. "Eugen_Cicero_-_Jazz_meets_Classic"), ist das der Albumkünstler —
|
# (2) dir_artist wenn Verzeichnisname einen Künstler nennt
|
||||||
# auch wenn die Track-Dateinamen die Komponisten-Namen enthalten.
|
# (3) Heuristiken (Various Artists, Mehrheitsabstimmung)
|
||||||
|
# Rationale: "Bach_Organ_-_Peter_Hurford" → dir_artist="Bach Organ" ist kein Künstler,
|
||||||
|
# aber der Verzeichnisname sieht aus wie Künstler; LLM kann das korrekt auflösen.
|
||||||
track_artists = [t.artist for t in hints.tracks if t.artist]
|
track_artists = [t.artist for t in hints.tracks if t.artist]
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
distinct_artists = set(a for a in track_artists if a)
|
distinct_artists = set(a for a in track_artists if a)
|
||||||
if hints.dir_artist:
|
|
||||||
# Verzeichnisname nennt explizit einen Künstler → immer verwenden
|
_bad_aa = {"various artists", "unknown artist", "unknown", "va"}
|
||||||
|
def _good_aa(s: Optional[str]) -> bool:
|
||||||
|
return bool(s) and s.casefold().strip() not in _bad_aa
|
||||||
|
|
||||||
|
if _good_aa(cl_albumartist) and confidence < 0.4:
|
||||||
|
# LLM kennt den echten Albumkünstler besser als der Verzeichnisname
|
||||||
|
albumartist = cl_albumartist # type: ignore[assignment]
|
||||||
|
elif hints.dir_artist:
|
||||||
albumartist = hints.dir_artist
|
albumartist = hints.dir_artist
|
||||||
elif len(distinct_artists) >= 3:
|
elif len(distinct_artists) >= 3:
|
||||||
albumartist = "Various Artists"
|
albumartist = "Various Artists"
|
||||||
|
|
@ -452,7 +486,9 @@ def resolve(
|
||||||
confidence = min(confidence, 1.0)
|
confidence = min(confidence, 1.0)
|
||||||
|
|
||||||
# Build track proposals
|
# Build track proposals
|
||||||
track_proposals = _build_track_proposals(hints, mb_tracks, album, artist)
|
# `artist` = Komponist/Hauptkünstler (LLM-aufgelöst), `albumartist` = Performer
|
||||||
|
# Werden beide weitergegeben damit _build_track_proposals richtig zuordnen kann.
|
||||||
|
track_proposals = _build_track_proposals(hints, mb_tracks, album, albumartist, composer=artist)
|
||||||
|
|
||||||
return AlbumProposal(
|
return AlbumProposal(
|
||||||
album_dir=hints.album_dir,
|
album_dir=hints.album_dir,
|
||||||
|
|
@ -476,15 +512,26 @@ def _build_track_proposals(
|
||||||
mb_tracks: Optional[List],
|
mb_tracks: Optional[List],
|
||||||
album: str,
|
album: str,
|
||||||
album_artist: str,
|
album_artist: str,
|
||||||
|
composer: Optional[str] = None,
|
||||||
) -> List[TrackProposal]:
|
) -> List[TrackProposal]:
|
||||||
proposals: List[TrackProposal] = []
|
proposals: List[TrackProposal] = []
|
||||||
|
|
||||||
for th in sorted(hints.tracks, key=lambda t: (t.disc_number or 1, t.track_number or 9999, str(t.path))):
|
for th in sorted(hints.tracks, key=lambda t: (t.disc_number or 1, t.track_number or 9999, str(t.path))):
|
||||||
title = th.title
|
title = th.title
|
||||||
artist = th.artist or album_artist
|
|
||||||
track_num = th.track_number
|
track_num = th.track_number
|
||||||
disc_num = th.disc_number
|
disc_num = th.disc_number
|
||||||
|
|
||||||
|
# Klassik-Fall: Performer aus Dateiname, Komponist aus LLM
|
||||||
|
# Wenn th.artist == albumartist (Performer), und wir den Komponisten kennen,
|
||||||
|
# wird der Komponist als Track-Artist gesetzt → Filename: TT_-_Performer_-_Komponist_-_Werk
|
||||||
|
th_artist_cf = (th.artist or "").casefold().strip()
|
||||||
|
aa_cf = album_artist.casefold().strip()
|
||||||
|
if composer and th_artist_cf == aa_cf and th_artist_cf:
|
||||||
|
# Performer == albumartist → Komponist als Track-Artist
|
||||||
|
artist = composer
|
||||||
|
else:
|
||||||
|
artist = th.artist or album_artist
|
||||||
|
|
||||||
# Try to match from MusicBrainz track list
|
# Try to match from MusicBrainz track list
|
||||||
if mb_tracks and track_num:
|
if mb_tracks and track_num:
|
||||||
for mb_t in mb_tracks:
|
for mb_t in mb_tracks:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue