Erweiterung: Stop-Mechanismus, REST-Service und MCP-Adapter

- chatterbox_cli_v4.py: kooperativer Stop-Mechanismus via threading.Event
  (STOP_REQUESTED, request_stop, clear_stop, stop_requested); PlaybackWorker, synthesize_non_streaming
  und synthesize_streaming prüfen das Event vor jedem Chunk; --stop CLI-Flag
- tts_service.py: FastAPI-Service mit Modell-Caching, Job-Queue und Worker-Thread;
  Endpunkte: POST /speak, POST /stop, GET /health, GET /status, GET /voices
- mcp_adapter.py: MCP-Adapter (stdio/streamable-http) über tts_service; Tools:
  speak, stop, get_status, list_voices
- requirements.txt: fastapi, uvicorn, httpx, mcp ergänzt
- CLAUDE.md: Architektur und Startbefehle dokumentiert
- .gitignore: Ideen/-Verzeichnis ausgeschlossen

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Dieter Schlüter 2026-05-16 09:46:43 +02:00
commit bcf6374c29
6 changed files with 563 additions and 3 deletions

View file

@ -10,6 +10,20 @@ import time
from pathlib import Path
from typing import List, Optional, Tuple
# ---------------------------------------------------------------------------
# Cooperative stop mechanism
# ---------------------------------------------------------------------------
# Module-level flag shared by everything that imports this module. A
# threading.Event is an in-process object, so setting it only affects code
# running in the same Python process.
STOP_REQUESTED = threading.Event()


def request_stop() -> None:
    """Set the module-level stop flag."""
    STOP_REQUESTED.set()


def clear_stop() -> None:
    """Reset the module-level stop flag."""
    STOP_REQUESTED.clear()


def stop_requested() -> bool:
    """Return True if the module-level stop flag is currently set."""
    return STOP_REQUESTED.is_set()
import torch
import torchaudio as ta
@ -556,10 +570,12 @@ class PlaybackWorker:
PLAYBACK_RATE = 48000 # PipeWire/PulseAudio standard
CALLBACK_BLOCK = 2048 # ~43 ms pro Callback-Block bei 48 kHz
def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0):
def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0,
stop_event: Optional[threading.Event] = None):
self.sample_rate = sample_rate
self.device = device
self.speed = speed
self.stop_event = stop_event
# Eingang: Torch-Tensoren vom TTS-Modell
self.audio_queue: "queue.Queue[Optional[torch.Tensor]]" = queue.Queue()
# Intern: fertig vorbereitete numpy-Blöcke für den Callback
@ -579,6 +595,9 @@ class PlaybackWorker:
def _callback(self, outdata, frames, time_info, status):
# Läuft im Audio-Thread: so schnell wie möglich, kein Lock nötig.
if self.stop_event and self.stop_event.is_set():
outdata[:] = 0.0
return
try:
data = self._block_queue.get_nowait()
outdata[:, 0] = data
@ -593,6 +612,8 @@ class PlaybackWorker:
remainder = np.zeros(0, dtype="float32")
while True:
if self.stop_event and self.stop_event.is_set():
break
item = self.audio_queue.get()
if item is None:
break
@ -690,6 +711,7 @@ def synthesize_non_streaming(
debug_delay: float = 0.0,
t3_model: Optional[str] = None,
pronunciation_dict: Optional[dict] = None,
stop_event: Optional[threading.Event] = None,
) -> Optional[Path]:
if lang not in SUPPORTED_LANGS:
raise ValueError(
@ -739,7 +761,8 @@ def synthesize_non_streaming(
print(f"Ausgabe: {output_path}")
if play_audio:
playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed)
playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed,
stop_event=stop_event)
playback.start()
else:
playback = None
@ -747,6 +770,10 @@ def synthesize_non_streaming(
wavs = []
try:
for i, chunk in enumerate(chunks, start=1):
if stop_event and stop_event.is_set():
if show_progress:
print("Abbruch angefordert Synthese gestoppt.")
break
if debug_delay > 0:
if show_progress:
print(f"[{i}/{len(chunks)}] Warte {debug_delay:.0f}s (debug_delay) ...")
@ -793,6 +820,7 @@ def synthesize_streaming(
save_wav: bool = True,
stream_chunk_size: int = 25,
audio_device: Optional[str] = None,
stop_event: Optional[threading.Event] = None,
) -> Optional[Path]:
if lang not in SUPPORTED_LANGS:
raise ValueError(
@ -829,7 +857,7 @@ def synthesize_streaming(
raise ValueError("Kein verwertbarer Text nach dem Einlesen gefunden.")
if play_audio:
playback = PlaybackWorker(sample_rate=sr, device=audio_device)
playback = PlaybackWorker(sample_rate=sr, device=audio_device, stop_event=stop_event)
playback.start()
else:
playback = None
@ -853,6 +881,10 @@ def synthesize_streaming(
try:
for text_idx, text_chunk in enumerate(text_chunks, start=1):
if stop_event and stop_event.is_set():
if show_progress:
print("Abbruch angefordert Streaming gestoppt.")
break
if show_progress:
print(f"[Text {text_idx}/{len(text_chunks)}] Starte Streaming für {len(text_chunk)} Zeichen ...")
@ -866,6 +898,8 @@ def synthesize_streaming(
)
for audio_idx, item in enumerate(stream_iter, start=1):
if stop_event and stop_event.is_set():
break
if isinstance(item, tuple) and len(item) == 2:
audio_chunk, metrics = item
else:
@ -944,6 +978,7 @@ def build_argparser() -> argparse.ArgumentParser:
p.add_argument("--debug-delay", type=float, default=0.0, help="Sekunden Pause vor jedem Satz (simuliert langsame KI). Nur zum Testen.")
p.add_argument("--t3-model", type=str, default="v3", help="Multilingual T3-Modell: 'v3' (default), 'v2' oder Dateiname.")
p.add_argument("--no-conversation-mode", action="store_true", help="Ersten Chunk nicht künstlich kleiner machen (nur ohne --no-sentence-mode).")
p.add_argument("--stop", action="store_true", help="Globales Stop-Signal setzen (für Tests und Service-Integration).")
return p
@ -951,6 +986,11 @@ def main() -> int:
parser = build_argparser()
args = parser.parse_args()
if args.stop:
request_stop()
print("Stop-Signal gesetzt.")
return 0
try:
text = read_input_text(args.text, args.input)
device = get_device(args.device)
@ -970,6 +1010,8 @@ def main() -> int:
raise FileNotFoundError(f"Aussprache-Dict nicht gefunden: {pron_path}")
pronunciation_dict = json.loads(pron_path.read_text(encoding="utf-8"))
clear_stop()
if args.stream:
out = synthesize_streaming(
text=text,
@ -990,6 +1032,7 @@ def main() -> int:
save_wav=save_wav,
stream_chunk_size=args.stream_chunk_size,
audio_device=args.audio_device,
stop_event=STOP_REQUESTED,
)
else:
out = synthesize_non_streaming(
@ -1015,6 +1058,7 @@ def main() -> int:
debug_delay=args.debug_delay,
t3_model=args.t3_model,
pronunciation_dict=pronunciation_dict,
stop_event=STOP_REQUESTED,
)
if out is not None: