Music_Metadata_Enricher/hint_extractor.py

332 lines
12 KiB
Python
Raw Normal View History

from __future__ import annotations
import re
import sys
from pathlib import Path
from typing import Optional, List, Dict, Tuple
from models import AlbumScan, AlbumHints, TrackHints
try:
from mutagen import File as MutagenFile
HAS_MUTAGEN = True
except ImportError:
HAS_MUTAGEN = False
try:
from bs4 import BeautifulSoup
HAS_BS4 = True
except ImportError:
HAS_BS4 = False
_NATSORT_RE = re.compile(r"(\d+)")
_BAD_VALUES = {"unknown", "unknown artist", "unknown album", "untitled", "track", "va", "various"}
# Filename patterns: most specific first
_FILENAME_PATTERNS = [
re.compile(r"^(?P<disc>\d{1,2})[- _]+(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<artist>.+?)\s*[-]\s*(?P<title>.+)$"),
re.compile(r"^(?P<disc>\d{1,2})[- _]+(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<title>.+)$"),
re.compile(r"^(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<artist>.+?)\s*[-]\s*(?P<title>.+)$"),
re.compile(r"^(?P<track>\d{1,3})\s*[-._ ]+\s*(?P<title>.+)$"),
re.compile(r"^(?P<artist>.+?)\s*[-]\s*(?P<title>.+)$"),
]
# Directory name patterns
_DIR_PATTERNS = [
re.compile(r"^(?P<artist>.+?)[_ -]+[-][_ -]+(?P<album>.+?)(?:[_ -]+(?P<year>\d{4}))?$"),
re.compile(r"^(?P<artist>.+?)[_ ]+(?P<year>\d{4})[._ -]+(?P<album>.+)$"),
re.compile(r"^(?P<album>.+?)[_ -]+(?P<year>\d{4})$"),
]
# Tracklist line patterns
_TRACKLIST_PATTERNS = [
re.compile(r"^(?P<disc>\d{1,2})[- _](?P<track>\d{1,3})\s+(?P<title>.+?)(?:\s+\d+:\d{2})?$"),
re.compile(r"^(?P<track>\d{1,3})[.):\s]+(?P<title>.+?)(?:\s+\d+:\d{2})?$"),
re.compile(r"^(?P<track>[A-Z]\d{1,2})[.):\s]+(?P<title>.+?)(?:\s+\d+:\d{2})?$"),
]
_DISC_SECTION_RE = re.compile(r"(?i)(?:cd|disc|disk|side)[_ \-]*(\d{1,2})")
def _clean(s: Optional[str]) -> str:
if not s:
return ""
# BOM (U+FEFF), Zero-Width-Space (U+200B), Soft-Hyphen (U+00AD) entfernen
s = re.sub(r"[­]", "", s)
return re.sub(r"\s+", " ", s.replace("_", " ")).strip(" -._")
def _is_good(v: Optional[str]) -> bool:
if not v:
return False
return _clean(v).casefold() not in _BAD_VALUES
def _parse_dirname(name: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
name_clean = _clean(name)
for pat in _DIR_PATTERNS:
m = pat.match(name_clean)
if m:
d = m.groupdict()
artist = _clean(d.get("artist")) or None
album = _clean(d.get("album")) or None
year = d.get("year")
if _is_good(artist) or _is_good(album):
return artist, album, year
# No pattern matched — treat whole name as album
return None, _clean(name_clean), None
def _parse_filename(stem: str) -> Dict[str, str]:
stem_clean = _clean(stem)
for pat in _FILENAME_PATTERNS:
m = pat.match(stem_clean)
if m:
return {k: _clean(v) for k, v in m.groupdict().items() if v}
return {"title": stem_clean}
def _read_tags(path: Path) -> Tuple[Dict[str, str], Optional[float]]:
if not HAS_MUTAGEN:
return {}, None
try:
audio = MutagenFile(str(path), easy=True)
if not audio:
return {}, None
tags: Dict[str, str] = {}
for k in ("title", "artist", "album", "albumartist", "tracknumber",
"discnumber", "date", "year", "genre", "label", "organization"):
v = audio.get(k)
if v:
tags[k] = str(v[0]).strip()
if "year" in tags and "date" not in tags:
tags["date"] = tags["year"]
duration = None
if hasattr(audio, "info") and audio.info and hasattr(audio.info, "length"):
duration = audio.info.length
return tags, duration
except Exception as e:
print(f" ⚠️ Tag-Lesefehler {path.name}: {e}", file=sys.stderr)
return {}, None
def _parse_tracklist(text: str) -> List[Dict[str, str]]:
tracks: List[Dict[str, str]] = []
current_disc = 1
for line in text.splitlines():
line = line.strip()
if not line:
continue
disc_m = _DISC_SECTION_RE.match(line)
if disc_m and len(line) < 30:
current_disc = int(disc_m.group(1))
continue
for pat in _TRACKLIST_PATTERNS:
m = pat.match(line)
if m:
d = m.groupdict()
entry: Dict[str, str] = {"title": _clean(d.get("title", ""))}
raw_track = d.get("track", "")
if raw_track and raw_track.isdigit():
entry["track"] = raw_track.lstrip("0") or "0"
elif raw_track:
entry["track"] = raw_track
if "disc" in d and d["disc"]:
entry["disc"] = d["disc"]
else:
entry["disc"] = str(current_disc)
if entry.get("title"):
tracks.append(entry)
break
return tracks
def _parse_m3u(text: str) -> List[Dict[str, str]]:
"""M3U/M3U8 → geordnete Liste: [{filename, title, position}].
Reihenfolge der Einträge = gewünschte Trackreihenfolge.
"""
tracks: List[Dict[str, str]] = []
pending_title: Optional[str] = None
position = 0
for line in text.splitlines():
line = line.strip()
if not line:
continue
if line.upper().startswith("#EXTINF:"):
parts = line.split(",", 1)
pending_title = parts[1].strip() if len(parts) > 1 else None
elif not line.startswith("#"):
filename = Path(line.replace("\\", "/")).name
if not filename:
continue
position += 1
tracks.append({
"position": str(position),
"filename": filename,
"title": pending_title or "",
})
pending_title = None
return tracks
def _read_tracklist_file(path: Path) -> Optional[str]:
try:
if path.suffix.lower() in (".htm", ".html"):
raw = path.read_bytes()
encoding = "utf-8"
for enc in ("utf-8", "latin-1", "cp1252"):
try:
raw.decode(enc)
encoding = enc
break
except UnicodeDecodeError:
continue
text = raw.decode(encoding, errors="replace")
if HAS_BS4:
soup = BeautifulSoup(text, "html.parser")
return soup.get_text(separator="\n")
# Fallback: strip HTML tags
return re.sub(r"<[^>]+>", " ", text)
else:
for enc in ("utf-8", "latin-1", "cp1252"):
try:
return path.read_text(encoding=enc)
except UnicodeDecodeError:
continue
except Exception as e:
print(f" ⚠️ Tracklist-Lesefehler {path.name}: {e}", file=sys.stderr)
return None
def _check_cover_images(paths: List[Path]) -> List[Path]:
good: List[Path] = []
for p in paths:
name_lower = p.name.lower()
# Prefer front covers
if any(kw in name_lower for kw in ("front", "folder", "cover", "album")):
good.insert(0, p)
else:
good.append(p)
return good
def extract_hints(scan: AlbumScan) -> AlbumHints:
hints = AlbumHints(album_dir=scan.album_dir)
# Directory name
hints.dir_artist, hints.dir_album, hints.dir_year = _parse_dirname(scan.album_dir.name)
# Cover images
hints.cover_images = _check_cover_images(scan.image_files)
# Tracklist files
texts: List[str] = []
for tf in scan.tracklist_files:
txt = _read_tracklist_file(tf)
if txt:
texts.append(txt)
hints.tracklist_text = "\n\n".join(texts) if texts else None
parsed_tracklist = _parse_tracklist(hints.tracklist_text) if hints.tracklist_text else []
# M3U/Playlist-Reihenfolge: filename (stem, normalisiert) → Tracknummer
m3u_order: Dict[str, int] = {}
m3u_titles: Dict[str, str] = {}
for pf in scan.playlist_files:
try:
text = pf.read_text(encoding="utf-8", errors="replace")
for entry in _parse_m3u(text):
stem = _clean(Path(entry["filename"]).stem).casefold()
pos = int(entry["position"])
if stem and stem not in m3u_order:
m3u_order[stem] = pos
if entry.get("title"):
m3u_titles[stem] = entry["title"]
except Exception as e:
print(f" ⚠️ Playlist-Lesefehler {pf.name}: {e}", file=sys.stderr)
# Tracklist-Lookup: normalisierter Titel → Eintrag (für titelbasiertes Matching)
tl_by_title: Dict[str, Dict[str, str]] = {}
for entry in parsed_tracklist:
key = _clean(entry.get("title", "")).casefold()
if key:
tl_by_title[key] = entry
# Build TrackHints per audio file
for audio_path in sorted(scan.audio_files):
tags, duration = _read_tags(audio_path)
fn_hints = _parse_filename(audio_path.stem)
track_num: Optional[int] = None
disc_num: Optional[int] = None
# Track number: tag > filename
raw_tn = tags.get("tracknumber") or fn_hints.get("track")
if raw_tn:
try:
tn_int = int(str(raw_tn).split("/")[0])
if tn_int > 0: # 0 gilt als "keine Nummer"
track_num = tn_int
except ValueError:
pass
# Track number aus M3U-Reihenfolge (Vorrang vor Dateiname, aber nicht vor Tag)
if track_num is None:
stem_key = _clean(audio_path.stem).casefold()
if stem_key in m3u_order:
track_num = m3u_order[stem_key]
# Disc number: tag > filename > path segment
raw_dn = tags.get("discnumber") or fn_hints.get("disc")
if raw_dn:
try:
disc_num = int(str(raw_dn).split("/")[0])
except ValueError:
pass
if not disc_num:
for part in audio_path.relative_to(scan.album_dir).parts[:-1]:
dm = _DISC_SECTION_RE.search(part)
if dm:
disc_num = int(dm.group(1))
break
title = tags.get("title") or fn_hints.get("title")
artist = tags.get("artist") or fn_hints.get("artist")
# Tracklist: erst nach Nummer, dann nach Titel
if parsed_tracklist:
matched_tl: Optional[Dict[str, str]] = None
if track_num:
for tl_entry in parsed_tracklist:
tl_track = tl_entry.get("track")
tl_disc = tl_entry.get("disc", "1")
if (tl_track and int(tl_track) == track_num
and int(tl_disc) == (disc_num or 1)):
matched_tl = tl_entry
break
if matched_tl is None and title:
matched_tl = tl_by_title.get(_clean(title).casefold())
if matched_tl and not _is_good(title) and _is_good(matched_tl.get("title")):
title = matched_tl["title"]
# M3U-Titel als Fallback (enthält "Composer - Title" — nur nutzen wenn kein besserer Titel)
if not _is_good(title):
stem_key = _clean(audio_path.stem).casefold()
if stem_key in m3u_titles:
title = m3u_titles[stem_key]
hints.tracks.append(TrackHints(
path=audio_path,
track_number=track_num,
disc_number=disc_num,
title=_clean(title) if title else None,
artist=_clean(artist) if artist else None,
duration=duration,
existing_tags=tags,
))
return hints