Add --from-text mode and improve LLM parser robustness
- Add --from-text/-t option to scan and process commands for pre-formatted tracklists (e.g. from Perplexity)
- Refactor llm_parser to use Chat API instead of Generate API
- Reuse _extract_json() from vision_llm for robust JSON extraction
- Improve SYSTEM_PROMPT with strict rules (Various Artists, no invented years, no composer info in titles, /no_think)
- Remove format:"json" constraint that caused empty responses

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3d91614e66
commit
8ecade5cdc
2 changed files with 111 additions and 74 deletions
|
|
@@ -28,8 +28,9 @@ logging.basicConfig(
|
|||
)
|
||||
|
||||
|
||||
def _scan_images(
|
||||
images: list[Path],
|
||||
def _scan_to_album(
|
||||
images: list[Path] | None,
|
||||
from_text: Path | None,
|
||||
vision: bool,
|
||||
vision_model: str,
|
||||
languages: str,
|
||||
|
|
@@ -37,8 +38,14 @@ def _scan_images(
|
|||
model: str,
|
||||
base_url: str,
|
||||
) -> Album:
|
||||
"""Gemeinsame Scan-Logik für scan und process."""
|
||||
if vision:
|
||||
"""Gemeinsame Scan-Logik: Text-Datei, Vision-LLM oder OCR+LLM."""
|
||||
if from_text:
|
||||
text = from_text.read_text(encoding="utf-8")
|
||||
typer.echo(f"Text-Datei geladen ({len(text)} Zeichen). LLM-Parsing...")
|
||||
return parse_tracklist(
|
||||
text, backend=backend, model=model, base_url=base_url
|
||||
)
|
||||
elif vision:
|
||||
typer.echo(f"Vision-LLM ({vision_model})...")
|
||||
return parse_image(images, model=vision_model, base_url=base_url)
|
||||
else:
|
||||
|
|
@@ -63,10 +70,16 @@ def _print_album_summary(album: Album) -> None:
|
|||
|
||||
@app.command()
|
||||
def scan(
|
||||
images: list[Path] = typer.Argument(..., help="Bilder der CD-Rückseite/Booklet"),
|
||||
images: list[Path] = typer.Argument(
|
||||
None, help="Bilder der CD-Rückseite/Booklet"
|
||||
),
|
||||
output: Path = typer.Option(
|
||||
"album.json", "--output", "-o", help="Ausgabe-JSON-Datei"
|
||||
),
|
||||
from_text: Path = typer.Option(
|
||||
None, "--from-text", "-t",
|
||||
help="Text/Markdown-Datei mit Trackliste (z.B. von Perplexity)",
|
||||
),
|
||||
vision: bool = typer.Option(
|
||||
False, "--vision", "-v", help="Vision-LLM statt OCR+Text-LLM"
|
||||
),
|
||||
|
|
@@ -80,18 +93,31 @@ def scan(
|
|||
"http://localhost:11434", "--url", help="LLM-API-URL"
|
||||
),
|
||||
) -> None:
|
||||
"""Bilder → Album-JSON erzeugen (zur Prüfung vor dem Anwenden).
|
||||
"""Bilder oder Text → Album-JSON erzeugen (zur Prüfung vor dem Anwenden).
|
||||
|
||||
Mit --vision wird ein Vision-LLM (z.B. qwen3-vl) direkt auf die Bilder
|
||||
angewendet. Ohne --vision wird Tesseract-OCR + Text-LLM verwendet.
|
||||
Drei Modi:
|
||||
--from-text Textdatei (z.B. von Perplexity) → LLM → JSON
|
||||
--vision Bild → Vision-LLM → JSON
|
||||
(Standard) Bild → Tesseract-OCR → Text-LLM → JSON
|
||||
"""
|
||||
if from_text:
|
||||
if not from_text.exists():
|
||||
typer.echo(f"Fehler: Datei nicht gefunden: {from_text}", err=True)
|
||||
raise typer.Exit(1)
|
||||
elif not images:
|
||||
typer.echo(
|
||||
"Fehler: Bilder oder --from-text angeben.", err=True
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
else:
|
||||
for img in images:
|
||||
if not img.exists():
|
||||
typer.echo(f"Fehler: Bild nicht gefunden: {img}", err=True)
|
||||
raise typer.Exit(1)
|
||||
|
||||
album = _scan_images(
|
||||
images, vision, vision_model, languages, backend, model, base_url
|
||||
album = _scan_to_album(
|
||||
images, from_text, vision, vision_model,
|
||||
languages, backend, model, base_url,
|
||||
)
|
||||
|
||||
output.write_text(album.model_dump_json(indent=2), encoding="utf-8")
|
||||
|
|
@@ -156,6 +182,10 @@ def process(
|
|||
images: list[Path] | None = typer.Option(
|
||||
None, "--image", "-i", help="Zusätzliche Bilder für Scan"
|
||||
),
|
||||
from_text: Path = typer.Option(
|
||||
None, "--from-text", "-t",
|
||||
help="Text/Markdown-Datei mit Trackliste (z.B. von Perplexity)",
|
||||
),
|
||||
vision: bool = typer.Option(
|
||||
False, "--vision", "-v", help="Vision-LLM statt OCR+Text-LLM"
|
||||
),
|
||||
|
|
@@ -175,16 +205,17 @@ def process(
|
|||
if images:
|
||||
scan_sources.extend(images)
|
||||
|
||||
if not scan_sources:
|
||||
if not from_text and not scan_sources:
|
||||
typer.echo(
|
||||
"Fehler: Mindestens ein Bild nötig (--back oder --image)", err=True
|
||||
"Fehler: --from-text, --back oder --image angeben.", err=True
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
|
||||
# 1. Scan (Vision oder OCR+LLM)
|
||||
# 1. Scan (Text-Datei, Vision oder OCR+LLM)
|
||||
typer.echo("Schritt 1/4: Bilderkennung...")
|
||||
album = _scan_images(
|
||||
scan_sources, vision, vision_model, languages, backend, model, base_url
|
||||
album = _scan_to_album(
|
||||
scan_sources or None, from_text, vision, vision_model,
|
||||
languages, backend, model, base_url,
|
||||
)
|
||||
_print_album_summary(album)
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,4 +1,4 @@
|
|||
"""LLM-basiertes Parsing von OCR-Text zu strukturierten Album-Daten."""
|
||||
"""LLM-basiertes Parsing von Text zu strukturierten Album-Daten."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
@@ -9,60 +9,52 @@ import httpx
|
|||
from pydantic import ValidationError
|
||||
|
||||
from musiksammlung.models import Album
|
||||
from musiksammlung.vision_llm import _extract_json
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = """\
|
||||
Du bist ein Parser für CD-Rückseiten und Tracklisten.
|
||||
Analysiere den OCR-Text und extrahiere: Artist, Albumtitel, Jahr (falls vorhanden) \
|
||||
und für jede CD die Tracks in korrekter Reihenfolge.
|
||||
Ignoriere Werbung, Copyright-Hinweise und Kleingedrucktes.
|
||||
Du bist ein Parser für CD-Tracklisten. Extrahiere die Metadaten als JSON.
|
||||
|
||||
Regeln:
|
||||
- Wenn es Hinweise wie "CD 1", "CD 2", "Disc 1", "Disc 2" gibt, ordne die Tracks \
|
||||
der entsprechenden disc_number zu.
|
||||
- Ohne Disc-Angabe: alles als disc_number=1 behandeln.
|
||||
- Zusätze wie "live", "bonus track", "remastered" gehören in den Tracktitel.
|
||||
- Bei Unsicherheit: Feld weglassen oder null setzen, nichts erfinden.
|
||||
REGELN:
|
||||
- "artist": Wenn verschiedene Interpreten pro Track → "Various Artists". \
|
||||
NUR wenn alle Tracks denselben Interpreten haben, nimm diesen als artist.
|
||||
- "album": Der Albumtitel (z.B. "Deutsche Volkslieder").
|
||||
- "year": NUR wenn ein Jahr explizit im Text steht. Sonst null. NICHTS ERFINDEN.
|
||||
- "title": NUR der Songtitel. KEINE Komponisten, KEINE Interpreten, KEINE Zeitangaben.
|
||||
Beispiel: "Wer recht in Freuden wandern will" — NICHT \
|
||||
"Wer recht in Freuden wandern will (Klauer – Geibel)"
|
||||
- Jede Tracknummer darf nur EINMAL vorkommen. Keine Duplikate.
|
||||
- "CD 1", "CD 2", "Disc 1" etc. → eigene disc_number. Sonst disc_number=1.
|
||||
|
||||
Gib ausschließlich valides JSON zurück, kein anderer Text. Format:
|
||||
{
|
||||
"artist": "...",
|
||||
"album": "...",
|
||||
"year": 1987,
|
||||
"discs": [
|
||||
{
|
||||
"disc_number": 1,
|
||||
"name": null,
|
||||
"tracks": [
|
||||
{"track_number": 1, "title": "..."},
|
||||
{"track_number": 2, "title": "..."}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
"""
|
||||
Gib ausschließlich valides JSON zurück:
|
||||
{"artist":"Various Artists","album":"Albumname","year":null,\
|
||||
"discs":[{"disc_number":1,"name":null,\
|
||||
"tracks":[{"track_number":1,"title":"Nur der Songtitel"}]}]}
|
||||
|
||||
/no_think"""
|
||||
|
||||
|
||||
def _call_ollama(ocr_text: str, model: str, base_url: str) -> str:
|
||||
"""Ruft Ollama-API auf und gibt die Antwort als String zurück."""
|
||||
def _call_ollama(text: str, model: str, base_url: str) -> str:
|
||||
"""Ruft Ollama Chat-API auf und gibt die Antwort als String zurück."""
|
||||
response = httpx.post(
|
||||
f"{base_url}/api/generate",
|
||||
f"{base_url}/api/chat",
|
||||
json={
|
||||
"model": model,
|
||||
"system": SYSTEM_PROMPT,
|
||||
"prompt": ocr_text,
|
||||
"messages": [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": text},
|
||||
],
|
||||
"stream": False,
|
||||
"format": "json",
|
||||
},
|
||||
timeout=120.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
return response.json()["response"]
|
||||
return response.json()["message"]["content"]
|
||||
|
||||
|
||||
def _call_openai_compatible(
|
||||
ocr_text: str, model: str, base_url: str, api_key: str | None = None
|
||||
text: str, model: str, base_url: str, api_key: str | None = None
|
||||
) -> str:
|
||||
"""Ruft eine OpenAI-kompatible API auf (OpenAI, Anthropic via Proxy, etc.)."""
|
||||
headers = {}
|
||||
|
|
@@ -76,9 +68,8 @@ def _call_openai_compatible(
|
|||
"model": model,
|
||||
"messages": [
|
||||
{"role": "system", "content": SYSTEM_PROMPT},
|
||||
{"role": "user", "content": ocr_text},
|
||||
{"role": "user", "content": text},
|
||||
],
|
||||
"response_format": {"type": "json_object"},
|
||||
},
|
||||
timeout=120.0,
|
||||
)
|
||||
|
|
@ -87,17 +78,17 @@ def _call_openai_compatible(
|
|||
|
||||
|
||||
def parse_tracklist(
|
||||
ocr_text: str,
|
||||
text: str,
|
||||
backend: str = "ollama",
|
||||
model: str = "llama3",
|
||||
base_url: str = "http://localhost:11434",
|
||||
api_key: str | None = None,
|
||||
max_retries: int = 2,
|
||||
max_retries: int = 3,
|
||||
) -> Album:
|
||||
"""Parst OCR-Text via LLM zu einem Album-Modell.
|
||||
"""Parst Text (OCR oder Klartext) via LLM zu einem Album-Modell.
|
||||
|
||||
Args:
|
||||
ocr_text: Rohtext aus der OCR-Erkennung
|
||||
text: Eingabetext (OCR-Rohtext oder saubere Trackliste)
|
||||
backend: 'ollama' oder 'openai'
|
||||
model: Modellname
|
||||
base_url: API-Basis-URL
|
||||
|
|
@@ -107,20 +98,35 @@ def parse_tracklist(
|
|||
Returns:
|
||||
Validiertes Album-Objekt
|
||||
"""
|
||||
last_error: Exception | None = None
|
||||
|
||||
for attempt in range(max_retries + 1):
|
||||
try:
|
||||
if backend == "ollama":
|
||||
raw = _call_ollama(ocr_text, model, base_url)
|
||||
raw = _call_ollama(text, model, base_url)
|
||||
else:
|
||||
raw = _call_openai_compatible(ocr_text, model, base_url, api_key)
|
||||
raw = _call_openai_compatible(text, model, base_url, api_key)
|
||||
|
||||
data = json.loads(raw)
|
||||
logger.info(
|
||||
"LLM Antwort (Versuch %d, %d Zeichen)",
|
||||
attempt + 1, len(raw),
|
||||
)
|
||||
logger.debug("Rohantwort: %s", raw[:1000])
|
||||
|
||||
json_str = _extract_json(raw)
|
||||
data = json.loads(json_str)
|
||||
album = Album.model_validate(data)
|
||||
logger.info("LLM-Parsing erfolgreich: %s - %s", album.artist, album.album)
|
||||
logger.info(
|
||||
"LLM-Parsing erfolgreich: %s - %s", album.artist, album.album
|
||||
)
|
||||
return album
|
||||
|
||||
except (json.JSONDecodeError, ValidationError) as e:
|
||||
logger.warning("Versuch %d/%d fehlgeschlagen: %s", attempt + 1, max_retries + 1, e)
|
||||
if attempt == max_retries:
|
||||
except (json.JSONDecodeError, ValidationError, ValueError) as e:
|
||||
last_error = e
|
||||
logger.warning(
|
||||
"Versuch %d/%d fehlgeschlagen: %s",
|
||||
attempt + 1, max_retries + 1, e,
|
||||
)
|
||||
|
||||
msg = f"LLM lieferte nach {max_retries + 1} Versuchen kein valides JSON"
|
||||
raise ValueError(msg) from e
|
||||
raise ValueError(msg) from last_error
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue