Initial implementation of Music Metadata Enricher
AI-powered per-album pipeline: scan → local hints → MusicBrainz/Discogs/Claude resolve → cover art → interactive or auto review → tag write + rename + report. All external dependencies optional; 17/17 unit tests passing. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b273052f68
commit
f7cf520dbe
8 changed files with 1748 additions and 0 deletions
260
hint_extractor.py
Normal file
260
hint_extractor.py
Normal file
|
|
@ -0,0 +1,260 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
|
||||
from models import AlbumScan, AlbumHints, TrackHints
|
||||
|
||||
try:
|
||||
from mutagen import File as MutagenFile
|
||||
HAS_MUTAGEN = True
|
||||
except ImportError:
|
||||
HAS_MUTAGEN = False
|
||||
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
HAS_BS4 = True
|
||||
except ImportError:
|
||||
HAS_BS4 = False
|
||||
|
||||
# Captures runs of digits. NOTE(review): presumably meant for natural-order
# sorting; confirm where it is used.
_NATSORT_RE = re.compile(r"(\d+)")

# Placeholder values that carry no usable information (looked up case-folded).
_BAD_VALUES = {"unknown", "unknown artist", "unknown album", "untitled", "track", "va", "various"}

# Filename patterns, tried in order: most specific first.
_FILENAME_PATTERNS = [
    re.compile(source)
    for source in (
        # <disc>-<track> - <artist> - <title>
        r"^(?P<disc>\d{1,2})[- _]+(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<artist>.+?)\s*[-–]\s*(?P<title>.+)$",
        # <disc>-<track> - <title>
        r"^(?P<disc>\d{1,2})[- _]+(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<title>.+)$",
        # <track> - <artist> - <title>
        r"^(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<artist>.+?)\s*[-–]\s*(?P<title>.+)$",
        # <track> - <title>
        r"^(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<title>.+)$",
        # <artist> - <title>
        r"^(?P<artist>.+?)\s*[-–]\s*(?P<title>.+)$",
    )
]

# Directory name patterns, tried in order.
_DIR_PATTERNS = [
    re.compile(source)
    for source in (
        # <artist> - <album> [<year>]
        r"^(?P<artist>.+?)[_ -]+[-–][_ -]+(?P<album>.+?)(?:[_ -]+(?P<year>\d{4}))?$",
        # <artist> <year> - <album>
        r"^(?P<artist>.+?)[_ ]+(?P<year>\d{4})[._ -]+(?P<album>.+)$",
        # <album> <year>
        r"^(?P<album>.+?)[_ -]+(?P<year>\d{4})$",
    )
]

# Tracklist line patterns; each tolerates a trailing "m:ss" duration.
_TRACKLIST_PATTERNS = [
    re.compile(source)
    for source in (
        # <disc>-<track> <title>
        r"^(?P<disc>\d{1,2})[- _](?P<track>\d{1,3})\s+(?P<title>.+?)(?:\s+\d+:\d{2})?$",
        # <track>. <title>
        r"^(?P<track>\d{1,3})[.):\s]+(?P<title>.+?)(?:\s+\d+:\d{2})?$",
        # <letter><number>. <title>, e.g. "A1. Title"
        r"^(?P<track>[A-Z]\d{1,2})[.):\s]+(?P<title>.+?)(?:\s+\d+:\d{2})?$",
    )
]

# "CD 2", "disc_1", "Side-3", ... (case-insensitive); group 1 is the number.
_DISC_SECTION_RE = re.compile(r"(?i)(?:cd|disc|disk|side)[_ \-]*(\d{1,2})")
|
||||
|
||||
|
||||
def _clean(s: Optional[str]) -> str:
    """Normalise a raw name fragment.

    Underscores become spaces, every run of whitespace collapses to a single
    space, and leading/trailing spaces, hyphens, dots and underscores are
    trimmed. ``None`` and the empty string both yield ``""``.
    """
    if not s:
        return ""
    spaced = s.replace("_", " ")
    collapsed = re.sub(r"\s+", " ", spaced)
    return collapsed.strip(" -._")
|
||||
|
||||
|
||||
def _is_good(v: Optional[str]) -> bool:
    """Return True when *v* holds a usable, non-placeholder value.

    The value is normalised with ``_clean`` and case-folded before being
    looked up in ``_BAD_VALUES``.

    Args:
        v: Candidate value; may be ``None``.

    Returns:
        False for ``None``, for the empty string, for values that consist
        only of separators/whitespace (they clean down to ``""``) and for
        known placeholders; True otherwise.
    """
    normalized = _clean(v).casefold()
    # A value such as "-" or "__" cleans to "" and carries no information,
    # so it must not count as good even though "" is not in _BAD_VALUES.
    return bool(normalized) and normalized not in _BAD_VALUES
|
||||
|
||||
|
||||
def _parse_dirname(name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    """Split an album directory name into ``(artist, album, year)``.

    The name is normalised with ``_clean`` and matched against
    ``_DIR_PATTERNS`` in order. A match is accepted only when its artist or
    its album passes ``_is_good``; otherwise the next pattern is tried.

    Args:
        name: The bare directory name (no parent path).

    Returns:
        ``(artist, album, year)`` with ``None`` for every part that is missing
        or empty. When no pattern yields a usable match, the whole cleaned
        name is returned as the album, or ``None`` if nothing is left after
        cleaning.
    """
    name_clean = _clean(name)
    for pat in _DIR_PATTERNS:
        m = pat.match(name_clean)
        if m is None:
            continue
        parts = m.groupdict()
        artist = _clean(parts.get("artist")) or None
        album = _clean(parts.get("album")) or None
        if _is_good(artist) or _is_good(album):
            return artist, album, parts.get("year")
    # No pattern matched: treat the whole name as the album. The name is
    # already cleaned, and an empty result is reported as None, consistent
    # with the pattern branch above.
    return None, name_clean or None, None
|
||||
|
||||
|
||||
def _parse_filename(stem: str) -> Dict[str, str]:
    """Derive track fields from a file name stem.

    The stem is normalised with ``_clean`` and matched against
    ``_FILENAME_PATTERNS`` in order; the first pattern that matches wins.
    Every non-empty captured group is returned under its group name, cleaned.
    When nothing matches, the whole cleaned stem is returned as the title.
    """
    normalized = _clean(stem)
    # Lazily evaluate the patterns so that matching stops at the first hit.
    hit = next(filter(None, (rx.match(normalized) for rx in _FILENAME_PATTERNS)), None)
    if hit is None:
        return {"title": normalized}

    fields: Dict[str, str] = {}
    for key, raw in hit.groupdict().items():
        if raw:
            fields[key] = _clean(raw)
    return fields
|
||||
|
||||
|
||||
def _read_tags(path: Path) -> Tuple[Dict[str, str], Optional[float]]:
    """Read basic tags and the duration of one audio file via mutagen.

    Args:
        path: Audio file to inspect.

    Returns:
        ``(tags, duration)``. ``tags`` maps each key that is present to the
        stripped string form of its first value; a ``year`` value is copied
        to ``date`` when ``date`` itself is absent. ``duration`` is
        ``audio.info.length`` when available, else ``None``. ``({}, None)``
        is returned when mutagen is not installed, when mutagen does not
        recognise the file, or when reading fails (the error is reported on
        stderr).
    """
    if not HAS_MUTAGEN:
        return {}, None
    try:
        audio = MutagenFile(str(path), easy=True)
        # Compare against None explicitly: a mutagen file object is dict-like
        # and therefore falsy when the file merely has no tags, yet such a
        # file still has a usable duration.
        if audio is None:
            return {}, None

        tags: Dict[str, str] = {}
        for k in ("title", "artist", "album", "albumartist", "tracknumber",
                  "discnumber", "date", "year", "genre", "label", "organization"):
            v = audio.get(k)
            if v:
                tags[k] = str(v[0]).strip()
        if "year" in tags and "date" not in tags:
            tags["date"] = tags["year"]

        info = getattr(audio, "info", None)
        duration = getattr(info, "length", None) if info else None
        return tags, duration
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not abort the
        # whole album, so report the problem and carry on without tags.
        print(f" ⚠️ Tag-Lesefehler {path.name}: {e}", file=sys.stderr)
        return {}, None
|
||||
|
||||
|
||||
def _parse_tracklist(text: str) -> List[Dict[str, str]]:
    """Parse free-form tracklist text into one dict per recognised track line.

    Blank lines are ignored. A line shorter than 30 characters that starts
    with a disc marker (per ``_DISC_SECTION_RE``) switches the current disc
    and produces no entry. Every other line is matched against
    ``_TRACKLIST_PATTERNS``; the first matching pattern decides the result.

    Each entry holds ``"title"``, ``"track"`` and ``"disc"`` as strings.
    Purely numeric track numbers lose their leading zeros; other labels
    (e.g. ``"A1"``) are kept verbatim. The disc comes from the line itself
    when the pattern captured one, otherwise from the current disc section.
    Lines whose title is empty after cleaning are dropped.
    """
    entries: List[Dict[str, str]] = []
    active_disc = 1

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        header = _DISC_SECTION_RE.match(stripped)
        if header and len(stripped) < 30:
            active_disc = int(header.group(1))
            continue

        # Lazily evaluate the patterns so that only the first hit is used.
        hit = next(filter(None, (rx.match(stripped) for rx in _TRACKLIST_PATTERNS)), None)
        if hit is None:
            continue

        fields = hit.groupdict()
        title = _clean(fields.get("title", ""))
        if not title:
            continue

        record: Dict[str, str] = {"title": title}
        number = fields.get("track", "")
        if number:
            record["track"] = (number.lstrip("0") or "0") if number.isdigit() else number
        record["disc"] = fields.get("disc") or str(active_disc)
        entries.append(record)

    return entries
|
||||
|
||||
|
||||
def _read_tracklist_file(path: Path) -> Optional[str]:
    """Read a tracklist file and return its content as plain text.

    The file is read once as bytes and decoded with the first encoding that
    accepts it: UTF-8 (a leading byte-order mark is dropped), then cp1252,
    then latin-1. latin-1 accepts every byte sequence, so it must come last;
    trying it earlier would make cp1252 unreachable and turn Windows dashes
    and quotes into control characters.

    For ``.htm``/``.html`` files the markup is removed, with BeautifulSoup
    when it is installed and with a simple tag-stripping regex otherwise.
    For all other files line endings are normalised to ``"\\n"``.

    Args:
        path: Tracklist file to read.

    Returns:
        The extracted text, or ``None`` when reading or parsing fails (the
        error is reported on stderr).
    """
    try:
        raw = path.read_bytes()

        text: Optional[str] = None
        for enc in ("utf-8-sig", "cp1252"):
            try:
                text = raw.decode(enc)
                break
            except UnicodeDecodeError:
                continue
        if text is None:
            # cp1252 leaves a few byte values undefined; latin-1 never fails.
            text = raw.decode("latin-1")

        if path.suffix.lower() not in (".htm", ".html"):
            # Match text-mode reading, which translates all line endings.
            return text.replace("\r\n", "\n").replace("\r", "\n")

        if HAS_BS4:
            soup = BeautifulSoup(text, "html.parser")
            return soup.get_text(separator="\n")
        # Fallback: strip HTML tags
        return re.sub(r"<[^>]+>", " ", text)
    except Exception as e:
        # Deliberately best-effort: a broken tracklist file only costs hints.
        print(f" ⚠️ Tracklist-Lesefehler {path.name}: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def _check_cover_images(paths: List[Path]) -> List[Path]:
    """Order candidate images so that likely front covers come first.

    An image is preferred when its lower-cased file name contains "front",
    "folder", "cover" or "album". Preferred images are returned first, in
    the reverse of their input order (the last preferred input leads),
    followed by all remaining images in their input order.
    """
    keywords = ("front", "folder", "cover", "album")
    preferred: List[Path] = []
    remaining: List[Path] = []
    for candidate in paths:
        lowered = candidate.name.lower()
        bucket = preferred if any(word in lowered for word in keywords) else remaining
        bucket.append(candidate)
    preferred.reverse()
    return preferred + remaining
|
||||
|
||||
|
||||
def _int_before_slash(raw: Optional[str]) -> Optional[int]:
    """Parse the integer in front of an optional "/total" suffix ("3/12" -> 3)."""
    if not raw:
        return None
    try:
        return int(str(raw).split("/")[0])
    except ValueError:
        return None


def _tracklist_titles_by_slot(parsed: List[Dict[str, str]]) -> Dict[Tuple[int, int], Optional[str]]:
    """Index tracklist titles by (disc, track), keeping the first entry per slot."""
    index: Dict[Tuple[int, int], Optional[str]] = {}
    for entry in parsed:
        try:
            slot = (int(entry.get("disc", "1")), int(entry.get("track", "")))
        except (TypeError, ValueError):
            # Entries without a numeric track (e.g. vinyl-style "A1") cannot
            # be compared with an integer track number, so skip them.
            continue
        index.setdefault(slot, entry.get("title"))
    return index


def extract_hints(scan: AlbumScan) -> AlbumHints:
    """Collect all locally available metadata hints for one scanned album.

    Sources, in the order they are consulted:

    * the album directory name (artist / album / year),
    * the image files, reordered so likely front covers come first,
    * the tracklist files, joined into one text and parsed,
    * per audio file: its tags, then its file name, then for the disc
      number only the names of its parent directories below the album
      directory.

    A track title that fails ``_is_good`` is replaced by the tracklist title
    for the same (disc, track) slot when that one passes ``_is_good``; a
    missing disc number counts as disc 1 for this lookup.

    Args:
        scan: Scan result providing ``album_dir``, ``image_files``,
            ``tracklist_files`` and ``audio_files``.

    Returns:
        The populated ``AlbumHints`` with one ``TrackHints`` per audio file,
        in sorted path order.
    """
    hints = AlbumHints(album_dir=scan.album_dir)

    # Directory name
    hints.dir_artist, hints.dir_album, hints.dir_year = _parse_dirname(scan.album_dir.name)

    # Cover images
    hints.cover_images = _check_cover_images(scan.image_files)

    # Tracklist files
    texts: List[str] = [txt for txt in map(_read_tracklist_file, scan.tracklist_files) if txt]
    hints.tracklist_text = "\n\n".join(texts) if texts else None
    parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
    # Built once so each audio file needs a single lookup, not a rescan.
    titles_by_slot = _tracklist_titles_by_slot(parsed_tracklist)

    # Build TrackHints per audio file
    for audio_path in sorted(scan.audio_files):
        tags, duration = _read_tags(audio_path)
        fn_hints = _parse_filename(audio_path.stem)

        # Track number: tag > filename
        track_num = _int_before_slash(tags.get("tracknumber") or fn_hints.get("track"))

        # Disc number: tag > filename > path segment
        disc_num = _int_before_slash(tags.get("discnumber") or fn_hints.get("disc"))
        if not disc_num:
            try:
                parent_parts = audio_path.relative_to(scan.album_dir).parts[:-1]
            except ValueError:
                # The file is not located below the album directory, so there
                # are no sub-directory names to inspect.
                parent_parts = ()
            for part in parent_parts:
                dm = _DISC_SECTION_RE.search(part)
                if dm:
                    disc_num = int(dm.group(1))
                    break

        title = tags.get("title") or fn_hints.get("title")
        artist = tags.get("artist") or fn_hints.get("artist")

        # Enrich from parsed tracklist if track_num matches
        if track_num and not _is_good(title):
            tl_title = titles_by_slot.get((disc_num or 1, track_num))
            if _is_good(tl_title):
                title = tl_title

        hints.tracks.append(TrackHints(
            path=audio_path,
            track_number=track_num,
            disc_number=disc_num,
            title=_clean(title) if title else None,
            artist=_clean(artist) if artist else None,
            duration=duration,
            existing_tags=tags,
        ))

    return hints
|
||||
Loading…
Add table
Add a link
Reference in a new issue