Audio-Download-Endpunkt GET /audio/{job_id} hinzufügen

- SpeakRequest: keep_audio=true speichert WAV in ~/.cache/chatterbox-tts/
- SpeakJob: audio_path-Feld für gespeicherte WAV-Datei
- GET /audio/{job_id}: liefert WAV als FileResponse, löscht Datei danach
- mcp_adapter: keep_audio-Parameter in speak() weitergereicht
- Docstring: neuen Endpunkt dokumentiert

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-03 21:09:06 +02:00 · 2026-06-03 21:09:06 +02:00 · fe74b84360
commit fe74b84360
parent 69de37d1a0
2 changed files with 93 additions and 17 deletions
--- a/mcp_adapter.py
+++ b/mcp_adapter.py
@ -61,6 +61,7 @@ async def speak(
    interrupt: bool = False,
    speed: float = 1.0,
    session_id: str | None = None,
    keep_audio: bool = False,
 ) -> dict:
    """Text als Sprache ausgeben.
@ -68,14 +69,16 @@ async def speak(
    satzweise und beginnt sofort mit der Wiedergabe.
    Args:
-        text:       Auszugebender Text (max. 4000 Zeichen).
+        text:        Auszugebender Text (max. 4000 Zeichen).
-        lang:       Sprachcode, z. B. 'de', 'en', 'fr'. Standard: 'de'.
+        lang:        Sprachcode, z. B. 'de', 'en', 'fr'. Standard: 'de'.
-        voice:      Optionaler Pfad zu einer WAV-Referenzdatei (10–30s) für
+        voice:       Optionaler Pfad zu einer WAV-Referenzdatei (10–30s) für
-                    Voice Cloning.
+                     Voice Cloning.
-        interrupt:  True = laufende Ausgabe sofort unterbrechen und diesen
+        interrupt:   True = laufende Ausgabe sofort unterbrechen und diesen
-                    Text vorgezogen abspielen.
+                     Text vorgezogen abspielen.
-        speed:      Wiedergabegeschwindigkeit (0.5–2.0). Pitch bleibt gleich.
+        speed:       Wiedergabegeschwindigkeit (0.5–2.0). Pitch bleibt gleich.
-        session_id: Optionale Session-ID für Job-Tracking im TTS-Service.
+        session_id:  Optionale Session-ID für Job-Tracking im TTS-Service.
        keep_audio:  True = WAV-Datei nach der Synthese im Cache behalten;
                     abrufbar via GET /audio/{job_id}.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        r = await client.post(f"{TTS_URL}/speak", json={
@ -85,6 +88,7 @@ async def speak(
            "interrupt": interrupt,
            "speed": speed,
            "session_id": session_id,
            "keep_audio": keep_audio,
        })
        _raise_for_status(r)
        return r.json()
--- a/tts_service.py
+++ b/tts_service.py
@ -6,14 +6,18 @@ Start:
    uvicorn tts_service:app --host 0.0.0.0 --port 9999
 Endpunkte:
-    POST /speak   – Text in Warteschlange einreihen
+    POST /speak          – Text in Warteschlange einreihen
-    POST /stop    – laufende Ausgabe abbrechen, Queue leeren
+    POST /stop           – laufende Ausgabe abbrechen, Queue leeren
-    GET  /health  – Service-Status
+    POST /pause          – Ausgabe pausieren (ohne Datenverlust)
-    GET  /status  – aktueller Job + Queue-Länge
+    POST /resume         – pausierte Ausgabe fortsetzen
-    GET  /voices  – unterstützte Sprachen
+    GET  /audio/{job_id} – fertige WAV herunterladen (nur wenn keep_audio=true)
    GET  /health         – Service-Status
    GET  /status         – aktueller Job + Queue-Länge
    GET  /voices         – unterstützte Sprachen
 """
 from __future__ import annotations
 import os
 import queue
 import sys
 import threading
@ -30,8 +34,13 @@ import chatterbox_cli_v4 as tts  # noqa: E402
 import torch
 import torchaudio as ta
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import FileResponse
 from pydantic import BaseModel, Field
 # Directory for temporary audio downloads (used when keep_audio=True).
 # It lives under the current user's home directory and is created at import
 # time, including any missing parent directories; an already existing
 # directory is not an error.
 _AUDIO_CACHE_DIR = Path.home() / ".cache" / "chatterbox-tts"
 _AUDIO_CACHE_DIR.mkdir(parents=True, exist_ok=True)
 # ---------------------------------------------------------------------------
 # Gerät einmalig bestimmen
 # ---------------------------------------------------------------------------
@ -89,11 +98,13 @@ class SpeakJob:
    output_path: Optional[str]
    pronunciation_dict: Optional[dict]
    session_id: Optional[str]
    keep_audio: bool = False
    status: JobStatus = field(default=JobStatus.pending)
    text_preview: str = field(default="")
    chunks_total: int = 0
    chunks_done: int = 0
    error: Optional[str] = None
    audio_path: Optional[str] = None  # gesetzt wenn keep_audio=True und Job fertig
 # ---------------------------------------------------------------------------
@ -153,11 +164,18 @@ def _worker() -> None:
            finally:
                playback.stop()
-            if job.save_wav and job.output_path and wavs:
+            if wavs:
                out = Path(job.output_path)
                out.parent.mkdir(parents=True, exist_ok=True)
                final = wavs[0] if len(wavs) == 1 else torch.cat(wavs, dim=-1)
-                ta.save(str(out), final, sr)
+
                if job.save_wav and job.output_path:
                    out = Path(job.output_path)
                    out.parent.mkdir(parents=True, exist_ok=True)
                    ta.save(str(out), final, sr)
                if job.keep_audio:
                    cache_path = _AUDIO_CACHE_DIR / f"{job.id}.wav"
                    ta.save(str(cache_path), final, sr)
                    job.audio_path = str(cache_path)
            job.status = (
                JobStatus.cancelled if tts.stop_requested() else JobStatus.done
@ -196,6 +214,7 @@ class SpeakRequest(BaseModel):
    output_path: Optional[str] = None
    session_id: Optional[str] = None
    pronunciation_dict: Optional[dict] = None
    keep_audio: bool = False  # WAV im Cache behalten für GET /audio/{job_id}
 def _job_to_dict(j: SpeakJob) -> dict:
@ -264,6 +283,7 @@ def speak(req: SpeakRequest):
        output_path=req.output_path,
        pronunciation_dict=req.pronunciation_dict,
        session_id=req.session_id,
        keep_audio=req.keep_audio,
    )
    _job_queue.put(job)
@ -293,6 +313,58 @@ def resume():
    return {"resumed": True}
@app.get("/audio/{job_id}")
def download_audio(job_id: str):
    """Serve the cached WAV file of a finished job and delete it afterwards.

    A WAV file is only available when the job was submitted through /speak
    with ``keep_audio=true``. The file is removed by a background task that
    runs once the response has been sent, so each file can be fetched once.

    Args:
        job_id: Identifier of the job whose audio should be downloaded.

    Returns:
        A ``FileResponse`` with media type ``audio/wav``.

    Raises:
        HTTPException: 202 while the job is the current job or is still
            pending/running; 404 when the job is not among the recent jobs,
            was submitted without ``keep_audio``, or its file no longer
            exists.
    """
    still_busy_detail = "Job läuft noch — bitte später erneut abrufen."

    # Take a consistent snapshot of the shared state, then release the lock
    # before doing any lookups or file system work.
    with _state_lock:
        active_job = _current_job
        known_jobs = list(_recent_jobs)

    # The job that is currently being processed cannot be downloaded yet.
    if active_job and active_job.id == job_id:
        raise HTTPException(status_code=202, detail=still_busy_detail)

    # Find the first recent job with a matching id.
    job = None
    for candidate in known_jobs:
        if candidate.id == job_id:
            job = candidate
            break
    if job is None:
        raise HTTPException(status_code=404, detail=f"Job nicht gefunden: {job_id}")

    if job.status in (JobStatus.pending, JobStatus.running):
        raise HTTPException(status_code=202, detail=still_busy_detail)

    file_available = bool(job.audio_path) and Path(job.audio_path).exists()
    if not file_available:
        # Distinguish "audio was never kept" from "audio is already gone".
        if job.keep_audio:
            missing_detail = "Audio-Datei nicht mehr vorhanden."
        else:
            missing_detail = (
                "Keine Audio-Datei vorhanden. Bitte /speak mit keep_audio=true aufrufen."
            )
        raise HTTPException(status_code=404, detail=missing_detail)

    wav_file = Path(job.audio_path)

    def remove_cached_wav():
        # Best-effort cleanup: the reference on the job is only cleared when
        # the file was actually removed.
        try:
            os.unlink(wav_file)
        except OSError:
            return
        job.audio_path = None

    # Imported locally, as in the original implementation.
    from starlette.background import BackgroundTask

    # The background task runs after the response body has been sent.
    return FileResponse(
        path=str(wav_file),
        media_type="audio/wav",
        filename=f"tts_{job_id[:8]}.wav",
        background=BackgroundTask(remove_cached_wav),
    )
@app.get("/status")
 def status():
    with _state_lock: