Add Vision-LLM mode for direct image-to-JSON extraction
Tesseract OCR fails on rotated/low-contrast CD back covers. The new vision_llm module sends images directly to qwen3-vl via the Ollama chat API, bypassing OCR entirely. Robust JSON extraction handles thinking tags, Markdown code blocks, and empty responses. The CLI scan/process commands gain a --vision flag. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
686c4317d1
commit
1753ab204f
5 changed files with 359 additions and 55 deletions
37
tests/test_vision_llm.py
Normal file
37
tests/test_vision_llm.py
Normal file
|
|
@ -0,0 +1,37 @@
|
|||
"""Tests für die Vision-LLM JSON-Extraktion."""
|
||||
|
||||
import pytest
|
||||
|
||||
from musiksammlung.vision_llm import _extract_json
|
||||
|
||||
|
||||
def test_extract_pure_json():
    """Check that a bare JSON object passed in yields a result containing its artist value."""
    raw_response = '{"artist": "Test", "album": "Album"}'
    extracted = _extract_json(raw_response)
    assert '"Test"' in extracted
|
||||
|
||||
|
||||
def test_extract_json_from_markdown_block():
    """Check extraction when the JSON sits in a fenced ```json block between prose lines."""
    raw_response = 'Hier ist das Ergebnis:\n```json\n{"artist": "Test"}\n```\nFertig.'
    extracted = _extract_json(raw_response)
    assert '"Test"' in extracted
|
||||
|
||||
|
||||
def test_extract_json_with_thinking_tags():
    """Check extraction when a <think>...</think> section precedes the JSON object."""
    raw_response = '<think>Ich denke nach...</think>\n{"artist": "Test", "album": "X"}'
    assert '"Test"' in _extract_json(raw_response)
|
||||
|
||||
|
||||
def test_extract_json_with_surrounding_text():
    """Check extraction when plain text lines come before and after the JSON object."""
    raw_response = 'Das JSON:\n{"artist": "A", "album": "B"}\nEnde.'
    assert '"A"' in _extract_json(raw_response)
|
||||
|
||||
|
||||
def test_extract_json_empty_raises():
    """Check that an empty string raises ValueError matching "Leere Antwort"."""
    expect_empty_error = pytest.raises(ValueError, match="Leere Antwort")
    with expect_empty_error:
        _extract_json("")
|
||||
|
||||
|
||||
def test_extract_json_no_json_raises():
    """Check that text without any JSON raises ValueError matching "Kein JSON"."""
    expect_missing_json_error = pytest.raises(ValueError, match="Kein JSON")
    with expect_missing_json_error:
        _extract_json("Hier ist kein JSON, nur Text.")
|
||||
Loading…
Add table
Add a link
Reference in a new issue