Erweiterung: Stop-Mechanismus, REST-Service und MCP-Adapter

- chatterbox_cli_v4.py: kooperativer Stop-Mechanismus via threading.Event (STOP_REQUESTED, request_stop, clear_stop); PlaybackWorker, synthesize_non_streaming und synthesize_streaming prüfen das Event vor jedem Chunk; --stop CLI-Flag
- tts_service.py: FastAPI-Service mit Modell-Caching, Job-Queue und Worker-Thread; Endpunkte: POST /speak, POST /stop, GET /health, GET /status, GET /voices
- mcp_adapter.py: MCP-Adapter (stdio/streamable-http) über tts_service; Tools: speak, stop, get_status, list_voices
- requirements.txt: fastapi, uvicorn, httpx, mcp ergänzt
- CLAUDE.md: Architektur und Startbefehle dokumentiert
- .gitignore: Ideen/-Verzeichnis ausgeschlossen

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-16 09:46:43 +02:00 · 2026-05-16 09:46:43 +02:00 · bcf6374c29
commit bcf6374c29
parent bed29fb1c8
6 changed files with 563 additions and 3 deletions
--- a/chatterbox_cli_v4.py
+++ b/chatterbox_cli_v4.py
@@ -10,6 +10,20 @@ import time
 from pathlib import Path
 from typing import List, Optional, Tuple

+# ---------------------------------------------------------------------------
+# Kooperativer Stop-Mechanismus
+# ---------------------------------------------------------------------------
+STOP_REQUESTED = threading.Event()
+
+def request_stop() -> None:
+    STOP_REQUESTED.set()
+
+def clear_stop() -> None:
+    STOP_REQUESTED.clear()
+
+def stop_requested() -> bool:
+    return STOP_REQUESTED.is_set()
+
 import torch
 import torchaudio as ta

@@ -556,10 +570,12 @@ class PlaybackWorker:
    PLAYBACK_RATE = 48000  # PipeWire/PulseAudio standard
    CALLBACK_BLOCK = 2048  # ~43 ms pro Callback-Block bei 48 kHz

-    def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0):
+    def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0,
+                 stop_event: Optional[threading.Event] = None):
        self.sample_rate = sample_rate
        self.device = device
        self.speed = speed
+        self.stop_event = stop_event
        # Eingang: Torch-Tensoren vom TTS-Modell
        self.audio_queue: "queue.Queue[Optional[torch.Tensor]]" = queue.Queue()
        # Intern: fertig vorbereitete numpy-Blöcke für den Callback
@@ -579,6 +595,9 @@ class PlaybackWorker:

    def _callback(self, outdata, frames, time_info, status):
        # Läuft im Audio-Thread: so schnell wie möglich, kein Lock nötig.
+        if self.stop_event and self.stop_event.is_set():
+            outdata[:] = 0.0
+            return
        try:
            data = self._block_queue.get_nowait()
            outdata[:, 0] = data
@@ -593,6 +612,8 @@ class PlaybackWorker:
        remainder = np.zeros(0, dtype="float32")

        while True:
+            if self.stop_event and self.stop_event.is_set():
+                break
            item = self.audio_queue.get()
            if item is None:
                break
@@ -690,6 +711,7 @@ def synthesize_non_streaming(
    debug_delay: float = 0.0,
    t3_model: Optional[str] = None,
    pronunciation_dict: Optional[dict] = None,
+    stop_event: Optional[threading.Event] = None,
 ) -> Optional[Path]:
    if lang not in SUPPORTED_LANGS:
        raise ValueError(
@@ -739,7 +761,8 @@ def synthesize_non_streaming(
            print(f"Ausgabe: {output_path}")

    if play_audio:
-        playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed)
+        playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed,
+                                  stop_event=stop_event)
        playback.start()
    else:
        playback = None
@@ -747,6 +770,10 @@ def synthesize_non_streaming(
    wavs = []
    try:
        for i, chunk in enumerate(chunks, start=1):
+            if stop_event and stop_event.is_set():
+                if show_progress:
+                    print("Abbruch angefordert – Synthese gestoppt.")
+                break
            if debug_delay > 0:
                if show_progress:
                    print(f"[{i}/{len(chunks)}] Warte {debug_delay:.0f}s (debug_delay) ...")
@@ -793,6 +820,7 @@ def synthesize_streaming(
    save_wav: bool = True,
    stream_chunk_size: int = 25,
    audio_device: Optional[str] = None,
+    stop_event: Optional[threading.Event] = None,
 ) -> Optional[Path]:
    if lang not in SUPPORTED_LANGS:
        raise ValueError(
@@ -829,7 +857,7 @@ def synthesize_streaming(
        raise ValueError("Kein verwertbarer Text nach dem Einlesen gefunden.")

    if play_audio:
-        playback = PlaybackWorker(sample_rate=sr, device=audio_device)
+        playback = PlaybackWorker(sample_rate=sr, device=audio_device, stop_event=stop_event)
        playback.start()
    else:
        playback = None
@@ -853,6 +881,10 @@ def synthesize_streaming(

    try:
        for text_idx, text_chunk in enumerate(text_chunks, start=1):
+            if stop_event and stop_event.is_set():
+                if show_progress:
+                    print("Abbruch angefordert – Streaming gestoppt.")
+                break
            if show_progress:
                print(f"[Text {text_idx}/{len(text_chunks)}] Starte Streaming für {len(text_chunk)} Zeichen ...")

@@ -866,6 +898,8 @@ def synthesize_streaming(
            )

            for audio_idx, item in enumerate(stream_iter, start=1):
+                if stop_event and stop_event.is_set():
+                    break
                if isinstance(item, tuple) and len(item) == 2:
                    audio_chunk, metrics = item
                else:
@@ -944,6 +978,7 @@ def build_argparser() -> argparse.ArgumentParser:
    p.add_argument("--debug-delay", type=float, default=0.0, help="Sekunden Pause vor jedem Satz (simuliert langsame KI). Nur zum Testen.")
    p.add_argument("--t3-model", type=str, default="v3", help="Multilingual T3-Modell: 'v3' (default), 'v2' oder Dateiname.")
    p.add_argument("--no-conversation-mode", action="store_true", help="Ersten Chunk nicht künstlich kleiner machen (nur ohne --no-sentence-mode).")
+    p.add_argument("--stop", action="store_true", help="Globales Stop-Signal setzen (für Tests und Service-Integration).")
    return p


@@ -951,6 +986,11 @@ def main() -> int:
    parser = build_argparser()
    args = parser.parse_args()

+    if args.stop:
+        request_stop()
+        print("Stop-Signal gesetzt.")
+        return 0
+
    try:
        text = read_input_text(args.text, args.input)
        device = get_device(args.device)
@@ -970,6 +1010,8 @@ def main() -> int:
                raise FileNotFoundError(f"Aussprache-Dict nicht gefunden: {pron_path}")
            pronunciation_dict = json.loads(pron_path.read_text(encoding="utf-8"))

+        clear_stop()
+
        if args.stream:
            out = synthesize_streaming(
                text=text,
@@ -990,6 +1032,7 @@ def main() -> int:
                save_wav=save_wav,
                stream_chunk_size=args.stream_chunk_size,
                audio_device=args.audio_device,
+                stop_event=STOP_REQUESTED,
            )
        else:
            out = synthesize_non_streaming(
@@ -1015,6 +1058,7 @@ def main() -> int:
                debug_delay=args.debug_delay,
                t3_model=args.t3_model,
                pronunciation_dict=pronunciation_dict,
+                stop_event=STOP_REQUESTED,
            )

        if out is not None: