diff --git a/.gitignore b/.gitignore index 0c215f6..38953f4 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,7 @@ env/ # Claude Code .claude/ + +# Ideen +Ideen/ + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..32e2c30 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,86 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Running the CLI + +```bash +conda activate chatterbox + +# Deutschen Text aus Datei vorlesen +python chatterbox_cli_v4.py --lang de --input text.txt + +# Mit Voice Cloning +python chatterbox_cli_v4.py --lang de --voice my_voice.wav --input text.txt + +# Text direkt übergeben (Englisch) +python chatterbox_cli_v4.py --lang en --text "Hello world" + +# Nur speichern, kein Playback +python chatterbox_cli_v4.py --lang de --no-play --output ausgabe.wav --input text.txt + +# Geschwindigkeit anpassen (pitch-erhaltend, erfordert rubberband-cli) +python chatterbox_cli_v4.py --lang de --speed 0.85 --input text.txt + +# Streaming-Modus (experimentell, niedrigere Latenz, kann abgehackt klingen) +python chatterbox_cli_v4.py --lang de --stream --input text.txt + +# Aussprache-Wörterbuch (JSON: {"Eigenname": "Lautschrift"}) +python chatterbox_cli_v4.py --lang de --pronunciation-dict aussprache.json --input text.txt +``` + +No build step, no test suite, no linter configuration — this is a single-file script. + +## Architecture + +Everything lives in `chatterbox_cli_v4.py`. The processing pipeline is: + +**Text input → normalization → chunking → TTS generation → audio output** + +### Text normalization (`preprocess_tts_text`) +Applied per chunk before synthesis. Order matters: +1. Pronunciation dict substitutions (before acronym expansion, so proper names are caught first) +2. Unit normalization (120 km/h → "120 Kilometer pro Stunde") +3. Time normalization (14:58 → "vierzehn Uhr achtundfünfzig") +4. Year normalization (2026 → "zweitausendsechsundzwanzig") +5. 
Acronym spelling (ARD → "Ah Er De"; skips entries in `NON_SPELLED_ACRONYMS`) + +`DEFAULT_PRONUNCIATION_DE` contains built-in German phonetic approximations (e.g. Xi → "Schi"). + +### Text chunking +Three modes (chosen by CLI flags): +- **sentence_mode** (default): `split_into_sentences()` — one sentence per TTS call, lowest latency to first audio +- **conversation_mode**: `split_for_conversation()` — first chunk is small (`--first-chunk-len`, default 80 chars), rest up to `--len` (400) +- **plain**: `split_long_text()` — paragraph-aware chunking up to `--len` + +`SENTENCE_END_RE` handles edge cases like ordinal numbers, ellipses, and CJK punctuation. `SEPARATOR_LINE_RE` silently drops lines like `--- Ende ---`. + +### Model loading (`load_model`) +- `--lang en` → `ChatterboxTTS` (mono, always available) +- Other languages → `ChatterboxMultilingualTTS` (requires multilingual package; `HAS_MULTILINGUAL` flag guards import) +- `--t3-model v3` (default) or `v2` selects the multilingual T3 checkpoint +- Models are downloaded to `~/.cache/huggingface/` on first use (~2–3 GB) +- **Critical**: `attn_implementation = "eager"` is forced at import time because SDPA returns `None` attention weights, breaking the `AlignmentStreamAnalyzer` hook + +### Audio output (`PlaybackWorker`) +- Uses `sounddevice.OutputStream` with a callback at 48 kHz (PipeWire/PulseAudio standard) +- Internal producer thread converts Torch tensors → `CALLBACK_BLOCK`-sized (2048 samples) numpy arrays +- If `--speed != 1.0`: pyrubberband R3-Engine (`--fine` flag) stretches time without pitch change before resampling +- Resampling: `torchaudio.functional.resample(chunk, model_sr, 48000)` +- `PlaybackWorker.stop()` sends `None` sentinel into the queue and joins the thread + +### Two synthesis paths +- **`synthesize_non_streaming`**: generates each chunk fully, feeds finished tensors to `PlaybackWorker`, concatenates all wavs for `--save` +- **`synthesize_streaming`**: calls `model.generate_stream()` with 
`chunk_size`; each yielded audio sub-chunk goes directly to `PlaybackWorker`; marked experimental in docs + +## Planned extensions (Ideen/) + +The `Ideen/` folder documents a planned **REST/MCP bridge**: +- `tts_service.py` (FastAPI): `POST /speak`, `POST /stop`, `GET /health`, `GET /voices` +- `mcp_adapter.py`: thin MCP wrapper calling the REST API +- `chatterbox_backend.py`: imports `chatterbox_cli_v4.py` via `importlib` and calls `synthesize_non_streaming()` directly + +Key gaps to address before building the service: +1. **Stop/interrupt**: `PlaybackWorker.stop()` drains the audio queue, but a blocking `model.generate()` call cannot be interrupted mid-run. A `threading.Event`-based cancel token threaded through `synthesize_non_streaming` is the planned approach. +2. **Model caching**: `load_model()` reloads from disk on every call; a service needs a per-language singleton. +3. **Status object**: progress is `print()`-based; a service needs structured state. diff --git a/chatterbox_cli_v4.py b/chatterbox_cli_v4.py index 14c2a64..4ffc32d 100755 --- a/chatterbox_cli_v4.py +++ b/chatterbox_cli_v4.py @@ -10,6 +10,20 @@ import time from pathlib import Path from typing import List, Optional, Tuple +# --------------------------------------------------------------------------- +# Kooperativer Stop-Mechanismus +# --------------------------------------------------------------------------- +STOP_REQUESTED = threading.Event() + +def request_stop() -> None: + STOP_REQUESTED.set() + +def clear_stop() -> None: + STOP_REQUESTED.clear() + +def stop_requested() -> bool: + return STOP_REQUESTED.is_set() + import torch import torchaudio as ta @@ -556,10 +570,12 @@ class PlaybackWorker: PLAYBACK_RATE = 48000 # PipeWire/PulseAudio standard CALLBACK_BLOCK = 2048 # ~43 ms pro Callback-Block bei 48 kHz - def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0): + def __init__(self, sample_rate: int, device: Optional[str] = "pulse", speed: float = 1.0, + 
stop_event: Optional[threading.Event] = None): self.sample_rate = sample_rate self.device = device self.speed = speed + self.stop_event = stop_event # Eingang: Torch-Tensoren vom TTS-Modell self.audio_queue: "queue.Queue[Optional[torch.Tensor]]" = queue.Queue() # Intern: fertig vorbereitete numpy-Blöcke für den Callback @@ -579,6 +595,9 @@ class PlaybackWorker: def _callback(self, outdata, frames, time_info, status): # Läuft im Audio-Thread: so schnell wie möglich, kein Lock nötig. + if self.stop_event and self.stop_event.is_set(): + outdata[:] = 0.0 + return try: data = self._block_queue.get_nowait() outdata[:, 0] = data @@ -593,6 +612,8 @@ class PlaybackWorker: remainder = np.zeros(0, dtype="float32") while True: + if self.stop_event and self.stop_event.is_set(): + break item = self.audio_queue.get() if item is None: break @@ -690,6 +711,7 @@ def synthesize_non_streaming( debug_delay: float = 0.0, t3_model: Optional[str] = None, pronunciation_dict: Optional[dict] = None, + stop_event: Optional[threading.Event] = None, ) -> Optional[Path]: if lang not in SUPPORTED_LANGS: raise ValueError( @@ -739,7 +761,8 @@ def synthesize_non_streaming( print(f"Ausgabe: {output_path}") if play_audio: - playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed) + playback = PlaybackWorker(sample_rate=sr, device=audio_device, speed=speed, + stop_event=stop_event) playback.start() else: playback = None @@ -747,6 +770,10 @@ def synthesize_non_streaming( wavs = [] try: for i, chunk in enumerate(chunks, start=1): + if stop_event and stop_event.is_set(): + if show_progress: + print("Abbruch angefordert – Synthese gestoppt.") + break if debug_delay > 0: if show_progress: print(f"[{i}/{len(chunks)}] Warte {debug_delay:.0f}s (debug_delay) ...") @@ -793,6 +820,7 @@ def synthesize_streaming( save_wav: bool = True, stream_chunk_size: int = 25, audio_device: Optional[str] = None, + stop_event: Optional[threading.Event] = None, ) -> Optional[Path]: if lang not in 
SUPPORTED_LANGS: raise ValueError( @@ -829,7 +857,7 @@ def synthesize_streaming( raise ValueError("Kein verwertbarer Text nach dem Einlesen gefunden.") if play_audio: - playback = PlaybackWorker(sample_rate=sr, device=audio_device) + playback = PlaybackWorker(sample_rate=sr, device=audio_device, stop_event=stop_event) playback.start() else: playback = None @@ -853,6 +881,10 @@ def synthesize_streaming( try: for text_idx, text_chunk in enumerate(text_chunks, start=1): + if stop_event and stop_event.is_set(): + if show_progress: + print("Abbruch angefordert – Streaming gestoppt.") + break if show_progress: print(f"[Text {text_idx}/{len(text_chunks)}] Starte Streaming für {len(text_chunk)} Zeichen ...") @@ -866,6 +898,8 @@ def synthesize_streaming( ) for audio_idx, item in enumerate(stream_iter, start=1): + if stop_event and stop_event.is_set(): + break if isinstance(item, tuple) and len(item) == 2: audio_chunk, metrics = item else: @@ -944,6 +978,7 @@ def build_argparser() -> argparse.ArgumentParser: p.add_argument("--debug-delay", type=float, default=0.0, help="Sekunden Pause vor jedem Satz (simuliert langsame KI). 
Nur zum Testen.") p.add_argument("--t3-model", type=str, default="v3", help="Multilingual T3-Modell: 'v3' (default), 'v2' oder Dateiname.") p.add_argument("--no-conversation-mode", action="store_true", help="Ersten Chunk nicht künstlich kleiner machen (nur ohne --no-sentence-mode).") + p.add_argument("--stop", action="store_true", help="Globales Stop-Signal setzen (für Tests und Service-Integration).") return p @@ -951,6 +986,11 @@ def main() -> int: parser = build_argparser() args = parser.parse_args() + if args.stop: + request_stop() + print("Stop-Signal gesetzt.") + return 0 + try: text = read_input_text(args.text, args.input) device = get_device(args.device) @@ -970,6 +1010,8 @@ def main() -> int: raise FileNotFoundError(f"Aussprache-Dict nicht gefunden: {pron_path}") pronunciation_dict = json.loads(pron_path.read_text(encoding="utf-8")) + clear_stop() + if args.stream: out = synthesize_streaming( text=text, @@ -990,6 +1032,7 @@ def main() -> int: save_wav=save_wav, stream_chunk_size=args.stream_chunk_size, audio_device=args.audio_device, + stop_event=STOP_REQUESTED, ) else: out = synthesize_non_streaming( @@ -1015,6 +1058,7 @@ def main() -> int: debug_delay=args.debug_delay, t3_model=args.t3_model, pronunciation_dict=pronunciation_dict, + stop_event=STOP_REQUESTED, ) if out is not None: diff --git a/mcp_adapter.py b/mcp_adapter.py new file mode 100644 index 0000000..ca1562e --- /dev/null +++ b/mcp_adapter.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +""" +Chatterbox TTS – MCP-Adapter + +Setzt einen laufenden tts_service.py voraus (Standard: http://127.0.0.1:8000). 
+ +Start (streamable-http, Port 8001 – für beliebige MCP-Clients): + python mcp_adapter.py + +Start (stdio – für Claude Code / Claude Desktop): + python mcp_adapter.py --stdio + +Claude Code Konfiguration (.claude/settings.json): + { + "mcpServers": { + "chatterbox-tts": { + "command": "python", + "args": ["/home/dschlueter/chatterbox-tts-cli/mcp_adapter.py", "--stdio"] + } + } + } + +Umgebungsvariable TTS_URL überschreibt die Service-Adresse: + TTS_URL=http://192.168.1.10:8000 python mcp_adapter.py --stdio +""" +from __future__ import annotations + +import argparse +import os + +import httpx +from mcp.server.fastmcp import FastMCP + +TTS_URL = os.environ.get("TTS_URL", "http://127.0.0.1:8000").rstrip("/") + +mcp = FastMCP( + "Chatterbox TTS", + instructions=( + "Lokaler Text-to-Speech-Service. Liest Texte auf Deutsch und 20+ weiteren " + "Sprachen vor. Unterstützt Voice Cloning, Geschwindigkeitsanpassung und " + "Aussprache-Wörterbücher." + ), + port=8001, +) + + +# --------------------------------------------------------------------------- +# Tools +# --------------------------------------------------------------------------- + +@mcp.tool() +async def speak( + text: str, + lang: str = "de", + voice: str | None = None, + interrupt: bool = False, + speed: float = 1.0, +) -> dict: + """Text als Sprache ausgeben. + + Reiht den Text in die Ausgabewarteschlange ein. Das Modell generiert + satzweise und beginnt sofort mit der Wiedergabe. + + Args: + text: Auszugebender Text (max. 4000 Zeichen). + lang: Sprachcode, z. B. 'de', 'en', 'fr'. Standard: 'de'. + voice: Optionaler Pfad zu einer WAV-Referenzdatei (10–30s) für + Voice Cloning. + interrupt: True = laufende Ausgabe sofort unterbrechen und diesen + Text vorgezogen abspielen. + speed: Wiedergabegeschwindigkeit (0.5–2.0). Pitch bleibt gleich. 
+ """ + async with httpx.AsyncClient(timeout=15) as client: + r = await client.post(f"{TTS_URL}/speak", json={ + "text": text, + "lang": lang, + "voice": voice, + "interrupt": interrupt, + "speed": speed, + }) + r.raise_for_status() + return r.json() + + +@mcp.tool() +async def stop() -> dict: + """Laufende Sprachausgabe sofort stoppen und Warteschlange leeren.""" + async with httpx.AsyncClient(timeout=5) as client: + r = await client.post(f"{TTS_URL}/stop") + r.raise_for_status() + return r.json() + + +@mcp.tool() +async def get_status() -> dict: + """Aktuellen Ausgabe-Status abfragen. + + Gibt zurück: laufender Job (mit Chunk-Fortschritt), Queue-Länge und + die letzten abgeschlossenen Jobs. + """ + async with httpx.AsyncClient(timeout=5) as client: + r = await client.get(f"{TTS_URL}/status") + r.raise_for_status() + return r.json() + + +@mcp.tool() +async def list_voices() -> dict: + """Unterstützte Sprachen und Hinweise zu Voice Cloning abfragen.""" + async with httpx.AsyncClient(timeout=5) as client: + r = await client.get(f"{TTS_URL}/voices") + r.raise_for_status() + return r.json() + + +# --------------------------------------------------------------------------- +# Einstiegspunkt +# --------------------------------------------------------------------------- +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Chatterbox TTS MCP-Adapter") + parser.add_argument( + "--stdio", action="store_true", + help="stdio-Transport (für Claude Code / Claude Desktop)", + ) + parser.add_argument("--host", default="127.0.0.1", + help="Host für streamable-http (Standard: 127.0.0.1)") + parser.add_argument("--port", type=int, default=8001, + help="Port für streamable-http (Standard: 8001)") + args = parser.parse_args() + + # host/port are FastMCP settings; run() accepts only the transport name. + mcp.settings.host = args.host + mcp.settings.port = args.port + mcp.run(transport="stdio" if args.stdio else "streamable-http") diff --git a/requirements.txt b/requirements.txt index 6408a4d..3b709db 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,13 @@ sounddevice>=0.4.0 pyrubberband>=0.4.0 # rubberband-cli muss zusätzlich als Systempakete installiert sein: # sudo apt install rubberband-cli + +# HTTP-Service (Phase 2) +fastapi>=0.115.0 +uvicorn[standard]>=0.32.0 + +# HTTP-Client für MCP-Adapter (Phase 3) +httpx>=0.28.0 + +# MCP-Adapter (Phase 3) +mcp>=1.8.0 diff --git a/tts_service.py b/tts_service.py new file mode 100644 index 0000000..0b2a6a5 --- /dev/null +++ b/tts_service.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +Chatterbox TTS – lokaler HTTP-Service + +Start: + uvicorn tts_service:app --host 127.0.0.1 --port 8000 + +Endpunkte: + POST /speak – Text in Warteschlange einreihen + POST /stop – laufende Ausgabe abbrechen, Queue leeren + GET /health – Service-Status + GET /status – aktueller Job + Queue-Länge + GET /voices – unterstützte Sprachen +""" +from __future__ import annotations + +import queue +import sys +import threading +import uuid +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Optional + +# CLI-Modul aus demselben Verzeichnis laden +sys.path.insert(0, str(Path(__file__).parent)) +import chatterbox_cli_v4 as tts # noqa: E402 + +import torch +import torchaudio as ta +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +# --------------------------------------------------------------------------- +# Gerät einmalig bestimmen +# --------------------------------------------------------------------------- +_DEVICE = tts.get_device(None) + +# --------------------------------------------------------------------------- +# Modell-Cache (lang, t3_model) → (model, model_kind, sr) +# --------------------------------------------------------------------------- +_model_cache: dict[tuple, tuple] = {} +_model_lock = threading.Lock() + + +def _get_or_load_model(lang: str, t3_model: str) -> tuple: + key = (lang, t3_model) + with _model_lock: + if key not in 
_model_cache: + _model_cache[key] = tts.load_model(lang, _DEVICE, t3_model=t3_model) + return _model_cache[key] + + +# --------------------------------------------------------------------------- +# Job-Datenmodell +# --------------------------------------------------------------------------- +class JobStatus(str, Enum): + pending = "pending" + running = "running" + done = "done" + cancelled = "cancelled" + error = "error" + + +@dataclass +class SpeakJob: + id: str + text: str + lang: str + t3_model: str + voice: Optional[str] + speed: float + audio_device: str + max_len: int + save_wav: bool + output_path: Optional[str] + pronunciation_dict: Optional[dict] + session_id: Optional[str] + status: JobStatus = field(default=JobStatus.pending) + text_preview: str = field(default="") + chunks_total: int = 0 + chunks_done: int = 0 + error: Optional[str] = None + + +# --------------------------------------------------------------------------- +# Worker-Thread +# --------------------------------------------------------------------------- +_job_queue: queue.Queue[SpeakJob] = queue.Queue() +_current_job: Optional[SpeakJob] = None +_state_lock = threading.Lock() +_recent_jobs: list[SpeakJob] = [] +_MAX_RECENT = 20 + + +def _worker() -> None: + global _current_job + + while True: + job = _job_queue.get() + + with _state_lock: + _current_job = job + job.status = JobStatus.running + + tts.clear_stop() + + try: + model, model_kind, sr = _get_or_load_model(job.lang, job.t3_model) + + raw = tts.clean_raw_text(job.text) + raw_chunks = tts.split_into_sentences(raw, max_len=job.max_len) + chunks = [ + tts.preprocess_tts_text(c, lang=job.lang, + pronunciation_dict=job.pronunciation_dict) + for c in raw_chunks + ] + chunks = [c for c in chunks if c.strip()] + + job.chunks_total = len(chunks) + job.text_preview = job.text[:80] + + playback = tts.PlaybackWorker( + sample_rate=sr, + device=job.audio_device, + speed=job.speed, + stop_event=tts.STOP_REQUESTED, + ) + playback.start() + + wavs: 
list[torch.Tensor] = [] + try: + for chunk in chunks: + if tts.stop_requested(): + break + wav = tts.generate_chunk(model, model_kind, chunk, job.lang, job.voice) + wavs.append(wav) + playback.put(wav) + job.chunks_done += 1 + finally: + playback.stop() + + if job.save_wav and job.output_path and wavs: + out = Path(job.output_path) + out.parent.mkdir(parents=True, exist_ok=True) + final = wavs[0] if len(wavs) == 1 else torch.cat(wavs, dim=-1) + ta.save(str(out), final, sr) + + job.status = ( + JobStatus.cancelled if tts.stop_requested() else JobStatus.done + ) + + except Exception as exc: # noqa: BLE001 + job.status = JobStatus.error + job.error = str(exc) + + finally: + with _state_lock: + _current_job = None + _recent_jobs.append(job) + if len(_recent_jobs) > _MAX_RECENT: + _recent_jobs.pop(0) + _job_queue.task_done() + + +_worker_thread = threading.Thread(target=_worker, daemon=True, name="tts-worker") +_worker_thread.start() + + +# --------------------------------------------------------------------------- +# API-Modelle +# --------------------------------------------------------------------------- +class SpeakRequest(BaseModel): + text: str = Field(min_length=1, max_length=4000) + lang: str = "de" + voice: Optional[str] = None + interrupt: bool = False + speed: float = Field(default=1.0, ge=0.5, le=2.0) + t3_model: str = "v3" + audio_device: str = "pulse" + max_len: int = Field(default=400, ge=50, le=1000) + save_wav: bool = False + output_path: Optional[str] = None + session_id: Optional[str] = None + pronunciation_dict: Optional[dict] = None + + +def _job_to_dict(j: SpeakJob) -> dict: + return { + "id": j.id, + "status": j.status, + "lang": j.lang, + "text_preview": j.text_preview, + "chunks_total": j.chunks_total, + "chunks_done": j.chunks_done, + "error": j.error, + } + + +def _drain_queue() -> None: + while not _job_queue.empty(): + try: + _job_queue.get_nowait() + _job_queue.task_done() + except queue.Empty: + break + + +# 
--------------------------------------------------------------------------- +# FastAPI-App +# --------------------------------------------------------------------------- +app = FastAPI(title="Chatterbox TTS Service", version="1.0") + + +@app.get("/health") +def health(): + return {"status": "ok", "device": _DEVICE} + + +@app.get("/voices") +def voices(): + return { + "languages": sorted(tts.SUPPORTED_LANGS), + "note": "Voice cloning via 'voice' field (WAV-Pfad, 10–30s Aufnahme)", + } + + +@app.post("/speak") +def speak(req: SpeakRequest): + if req.lang not in tts.SUPPORTED_LANGS: + raise HTTPException(status_code=422, + detail=f"Sprache nicht unterstützt: {req.lang}") + if req.voice and not Path(req.voice).is_file(): + raise HTTPException(status_code=422, + detail=f"Voice-Datei nicht gefunden: {req.voice}") + + if req.interrupt: + tts.request_stop() + _drain_queue() + + job = SpeakJob( + id=str(uuid.uuid4()), + text=req.text, + lang=req.lang, + t3_model=req.t3_model, + voice=req.voice, + speed=req.speed, + audio_device=req.audio_device, + max_len=req.max_len, + save_wav=req.save_wav, + output_path=req.output_path, + pronunciation_dict=req.pronunciation_dict, + session_id=req.session_id, + ) + _job_queue.put(job) + + return { + "job_id": job.id, + "status": job.status, + "queue_position": _job_queue.qsize(), + } + + +@app.post("/stop") +def stop(): + tts.request_stop() + _drain_queue() + return {"stopped": True} + + +@app.get("/status") +def status(): + with _state_lock: + cur = _current_job + recent = list(_recent_jobs) + + return { + "current_job": _job_to_dict(cur) if cur else None, + "queue_length": _job_queue.qsize(), + "recent_jobs": [_job_to_dict(j) for j in reversed(recent)], + }