Erweiterung: Stop-Mechanismus, REST-Service und MCP-Adapter
- chatterbox_cli_v4.py: kooperativer Stop-Mechanismus via threading.Event (STOP_REQUESTED, request_stop, clear_stop, stop_requested); PlaybackWorker, synthesize_non_streaming und synthesize_streaming prüfen das Event vor jedem Chunk; --stop CLI-Flag
- tts_service.py: FastAPI-Service mit Modell-Caching, Job-Queue und Worker-Thread; Endpunkte: POST /speak, POST /stop, GET /health, GET /status, GET /voices
- mcp_adapter.py: MCP-Adapter (stdio/streamable-http) über tts_service; Tools: speak, stop, get_status, list_voices
- requirements.txt: fastapi, uvicorn, httpx, mcp ergänzt
- CLAUDE.md: Architektur und Startbefehle dokumentiert
- .gitignore: Ideen/-Verzeichnis ausgeschlossen

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
bed29fb1c8
commit
bcf6374c29
6 changed files with 563 additions and 3 deletions
|
|
@ -10,6 +10,20 @@ import time
|
|||
from pathlib import Path
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
# ---------------------------------------------------------------------------
# Cooperative stop mechanism
# ---------------------------------------------------------------------------
# A single process-wide event. Code that wants to be interruptible checks
# the event and returns early once it is set.
STOP_REQUESTED = threading.Event()


def request_stop() -> None:
    """Set the module-level stop event."""
    STOP_REQUESTED.set()


def clear_stop() -> None:
    """Reset the module-level stop event."""
    STOP_REQUESTED.clear()


def stop_requested() -> bool:
    """Return True while the module-level stop event is set."""
    return STOP_REQUESTED.is_set()
|
||||
|
||||
import torch
|
||||
import torchaudio as ta
|
||||
|
||||
|
|
@ -556,10 +570,12 @@ class PlaybackWorker:
|
|||
PLAYBACK_RATE = 48000 # PipeWire/PulseAudio standard
|
||||
CALLBACK_BLOCK = 2048 # ~43 ms pro Callback-Block bei 48 kHz
|
||||
|
||||
def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0):
|
||||
def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0,
|
||||
stop_event: Optional[threading.Event] = None):
|
||||
self.sample_rate = sample_rate
|
||||
self.device = device
|
||||
self.speed = speed
|
||||
self.stop_event = stop_event
|
||||
# Eingang: Torch-Tensoren vom TTS-Modell
|
||||
self.audio_queue: "queue.Queue[Optional[torch.Tensor]]" = queue.Queue()
|
||||
# Intern: fertig vorbereitete numpy-Blöcke für den Callback
|
||||
|
|
@ -579,6 +595,9 @@ class PlaybackWorker:
|
|||
|
||||
def _callback(self, outdata, frames, time_info, status):
|
||||
# Läuft im Audio-Thread: so schnell wie möglich, kein Lock nötig.
|
||||
if self.stop_event and self.stop_event.is_set():
|
||||
outdata[:] = 0.0
|
||||
return
|
||||
try:
|
||||
data = self._block_queue.get_nowait()
|
||||
outdata[:, 0] = data
|
||||
|
|
@ -593,6 +612,8 @@ class PlaybackWorker:
|
|||
remainder = np.zeros(0, dtype="float32")
|
||||
|
||||
while True:
|
||||
if self.stop_event and self.stop_event.is_set():
|
||||
break
|
||||
item = self.audio_queue.get()
|
||||
if item is None:
|
||||
break
|
||||
|
|
@ -690,6 +711,7 @@ def synthesize_non_streaming(
|
|||
debug_delay: float = 0.0,
|
||||
t3_model: Optional[str] = None,
|
||||
pronunciation_dict: Optional[dict] = None,
|
||||
stop_event: Optional[threading.Event] = None,
|
||||
) -> Optional[Path]:
|
||||
if lang not in SUPPORTED_LANGS:
|
||||
raise ValueError(
|
||||
|
|
@ -739,7 +761,8 @@ def synthesize_non_streaming(
|
|||
print(f"Ausgabe: {output_path}")
|
||||
|
||||
if play_audio:
|
||||
playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed)
|
||||
playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed,
|
||||
stop_event=stop_event)
|
||||
playback.start()
|
||||
else:
|
||||
playback = None
|
||||
|
|
@ -747,6 +770,10 @@ def synthesize_non_streaming(
|
|||
wavs = []
|
||||
try:
|
||||
for i, chunk in enumerate(chunks, start=1):
|
||||
if stop_event and stop_event.is_set():
|
||||
if show_progress:
|
||||
print("Abbruch angefordert – Synthese gestoppt.")
|
||||
break
|
||||
if debug_delay > 0:
|
||||
if show_progress:
|
||||
print(f"[{i}/{len(chunks)}] Warte {debug_delay:.0f}s (debug_delay) ...")
|
||||
|
|
@ -793,6 +820,7 @@ def synthesize_streaming(
|
|||
save_wav: bool = True,
|
||||
stream_chunk_size: int = 25,
|
||||
audio_device: Optional[str] = None,
|
||||
stop_event: Optional[threading.Event] = None,
|
||||
) -> Optional[Path]:
|
||||
if lang not in SUPPORTED_LANGS:
|
||||
raise ValueError(
|
||||
|
|
@ -829,7 +857,7 @@ def synthesize_streaming(
|
|||
raise ValueError("Kein verwertbarer Text nach dem Einlesen gefunden.")
|
||||
|
||||
if play_audio:
|
||||
playback = PlaybackWorker(sample_rate=sr, device=audio_device)
|
||||
playback = PlaybackWorker(sample_rate=sr, device=audio_device, stop_event=stop_event)
|
||||
playback.start()
|
||||
else:
|
||||
playback = None
|
||||
|
|
@ -853,6 +881,10 @@ def synthesize_streaming(
|
|||
|
||||
try:
|
||||
for text_idx, text_chunk in enumerate(text_chunks, start=1):
|
||||
if stop_event and stop_event.is_set():
|
||||
if show_progress:
|
||||
print("Abbruch angefordert – Streaming gestoppt.")
|
||||
break
|
||||
if show_progress:
|
||||
print(f"[Text {text_idx}/{len(text_chunks)}] Starte Streaming für {len(text_chunk)} Zeichen ...")
|
||||
|
||||
|
|
@ -866,6 +898,8 @@ def synthesize_streaming(
|
|||
)
|
||||
|
||||
for audio_idx, item in enumerate(stream_iter, start=1):
|
||||
if stop_event and stop_event.is_set():
|
||||
break
|
||||
if isinstance(item, tuple) and len(item) == 2:
|
||||
audio_chunk, metrics = item
|
||||
else:
|
||||
|
|
@ -944,6 +978,7 @@ def build_argparser() -> argparse.ArgumentParser:
|
|||
p.add_argument("--debug-delay", type=float, default=0.0, help="Sekunden Pause vor jedem Satz (simuliert langsame KI). Nur zum Testen.")
|
||||
p.add_argument("--t3-model", type=str, default="v3", help="Multilingual T3-Modell: 'v3' (default), 'v2' oder Dateiname.")
|
||||
p.add_argument("--no-conversation-mode", action="store_true", help="Ersten Chunk nicht künstlich kleiner machen (nur ohne --no-sentence-mode).")
|
||||
p.add_argument("--stop", action="store_true", help="Globales Stop-Signal setzen (für Tests und Service-Integration).")
|
||||
return p
|
||||
|
||||
|
||||
|
|
@ -951,6 +986,11 @@ def main() -> int:
|
|||
parser = build_argparser()
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.stop:
|
||||
request_stop()
|
||||
print("Stop-Signal gesetzt.")
|
||||
return 0
|
||||
|
||||
try:
|
||||
text = read_input_text(args.text, args.input)
|
||||
device = get_device(args.device)
|
||||
|
|
@ -970,6 +1010,8 @@ def main() -> int:
|
|||
raise FileNotFoundError(f"Aussprache-Dict nicht gefunden: {pron_path}")
|
||||
pronunciation_dict = json.loads(pron_path.read_text(encoding="utf-8"))
|
||||
|
||||
clear_stop()
|
||||
|
||||
if args.stream:
|
||||
out = synthesize_streaming(
|
||||
text=text,
|
||||
|
|
@ -990,6 +1032,7 @@ def main() -> int:
|
|||
save_wav=save_wav,
|
||||
stream_chunk_size=args.stream_chunk_size,
|
||||
audio_device=args.audio_device,
|
||||
stop_event=STOP_REQUESTED,
|
||||
)
|
||||
else:
|
||||
out = synthesize_non_streaming(
|
||||
|
|
@ -1015,6 +1058,7 @@ def main() -> int:
|
|||
debug_delay=args.debug_delay,
|
||||
t3_model=args.t3_model,
|
||||
pronunciation_dict=pronunciation_dict,
|
||||
stop_event=STOP_REQUESTED,
|
||||
)
|
||||
|
||||
if out is not None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue