Bugfixes, Verbesserungen und Mixed-Language-Support
Bugfixes:
- Abkürzungen (z.B., d.h., Dr., Prof.) werden nicht mehr als Satzenden erkannt (_ABBREV_MASK_RE)
- Multilingual-Import: except Exception → except (ImportError, ModuleNotFoundError)
- tts_agent: ReAct-Schleife auf max. 10 Iterationen begrenzt, model_dump → explizites Dict
- tts_service: audio_device=None fällt auf 'pulse' zurück
- JSON-Fehlerbehandlung für --pronunciation-dict mit aussagekräftiger Meldung
- PlaybackWorker: Audio-Device wird vor Stream-Start via sd.query_devices() geprüft
- mcp_adapter: Fehlerbehandlung für HTTP-Fehler, Timeout erhöht, session_id ergänzt
- tts_agent: Health-Check beim Start, --speed/--first-chunk-len Validierung

Neue Features:
- Gemischtsprachige Texte: [en]...[/en]-Markierungen für per-Segment language_id
- strip_markdown(): entfernt Markdown-Formatierung vor der Synthese (--no-strip-markdown)
- Emoji-Entfernung in clean_raw_text() via unicodedata
- Pause/Resume: request_pause()/request_resume(), POST /pause, POST /resume, MCP-Tools
- Neue Einheiten: °C, °F, kWh, kW, W, V, A, J, kPa, bar, m², m³, m/s, rpm
- number_to_words_de/en bis Milliarden
- DEFAULT_PRONUNCIATION_DE erweitert (GitHub, YouTube, LinkedIn, Wi-Fi, iPhone, ChatGPT, …)
- NON_SPELLED_ACRONYMS erweitert (USB, CPU, GPU, API, CEO, HTML, …)
- Nummerierte Listen als separate Chunks behandelt
- Modell-Warmup via TTS_PRELOAD_LANG Env-Variable
- requirements.txt: Upper-Bounds für fastapi und uvicorn

Dokumentation: CLAUDE.md, README.md, BEDIENUNGSANLEITUNG.md vollständig aktualisiert

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d1971049ce
commit
34a34907a8
8 changed files with 778 additions and 114 deletions
237
tts_agent.py
Normal file
237
tts_agent.py
Normal file
|
|
@ -0,0 +1,237 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
TTS Agent — conversational agent with speak/stop/status tools.
|
||||
|
||||
Works with any OpenAI-compatible LLM backend:
|
||||
- Ollama: python tts_agent.py --model qwen2.5
|
||||
- LM Studio: python tts_agent.py --base-url http://localhost:1234/v1 --model local-model
|
||||
- OpenAI: OPENAI_API_KEY=sk-... python tts_agent.py --model gpt-4o
|
||||
|
||||
The agent automatically calls speak() when it should read text aloud.
|
||||
Set TTS_URL to override the default TTS service address.
|
||||
|
||||
Pi (Inflection AI) is not supported — it has no public API or function calling.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
import httpx
|
||||
|
||||
# Base URL of the TTS service; overridable through the TTS_URL environment
# variable. Trailing slashes are stripped so endpoint paths can be appended
# directly with an f-string.
TTS_URL = os.getenv("TTS_URL", "http://127.0.0.1:9999").rstrip("/")
|
||||
|
||||
# Default system prompt (German). It introduces the three TTS tools to the
# model and tells it when to call speak(). The text is assembled line by line;
# the empty entry yields the blank line that separates the tool list from the
# behavioural instructions.
SYSTEM_PROMPT = "\n".join(
    (
        "Du bist ein hilfreicher Sprachassistent.",
        "Du hast Zugriff auf einen Text-to-Speech-Service mit folgenden Tools:",
        "- speak(text, lang, speed, interrupt): Text vorlesen lassen",
        "- stop(): laufende Ausgabe stoppen",
        "- get_status(): Ausgabe-Status abfragen",
        "",
        "Wenn der Nutzer darum bittet, etwas vorzulesen, oder wenn du eine längere Antwort",
        "gibst, die zum Vorlesen geeignet ist, ruf speak() mit dem entsprechenden Text auf.",
        "Antworte auf Deutsch, sofern nicht anders gewünscht.",
    )
)
|
||||
|
||||
def _function_tool(name: str, description: str, properties: dict, required: tuple = ()) -> dict:
    """Build one tool entry in the OpenAI function-calling format.

    Args:
        name: Tool name the model uses to call the function.
        description: Human-readable description shown to the model.
        properties: JSON-schema ``properties`` mapping of the tool's arguments.
        required: Names of mandatory arguments; the ``required`` key is only
            emitted when this is non-empty.

    Returns:
        A dict with the keys ``type`` and ``function``.
    """
    schema: dict = {"type": "object", "properties": properties}
    if required:
        schema["required"] = list(required)
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": schema,
        },
    }


# Tool schemas passed to the chat-completions call: speak, stop, get_status.
TOOLS = [
    _function_tool(
        "speak",
        "Text als Sprache ausgeben. Reiht den Text in die Warteschlange ein "
        "und gibt sofort zurück. Ideal für längere Texte oder wenn der Nutzer "
        "etwas vorgelesen haben möchte.",
        {
            "text": {
                "type": "string",
                "description": "Auszugebender Text (max. 4000 Zeichen).",
            },
            "lang": {
                "type": "string",
                "default": "de",
                "description": "Sprachcode: 'de', 'en', 'fr', 'es', … Standard: 'de'.",
            },
            "speed": {
                "type": "number",
                "default": 1.0,
                "description": "Geschwindigkeit 0.5–2.0. Standard: 1.0.",
            },
            "interrupt": {
                "type": "boolean",
                "default": False,
                "description": "True = laufende Ausgabe sofort unterbrechen.",
            },
        },
        required=("text",),
    ),
    _function_tool(
        "stop",
        "Laufende Sprachausgabe sofort stoppen und Warteschlange leeren.",
        {},
    ),
    _function_tool(
        "get_status",
        "Aktuellen Ausgabe-Status abfragen (laufender Job, Queue-Länge).",
        {},
    ),
]
|
||||
|
||||
|
||||
def _call_tts(name: str, args: dict) -> str:
    """Execute one tool call against the TTS service.

    Args:
        name: Tool name; one of ``"speak"``, ``"stop"`` or ``"get_status"``.
        args: Decoded tool arguments. They are sent as the JSON body of the
            ``speak`` request and ignored for the other tools.

    Returns:
        The raw response body on success. On failure a JSON object string with
        an ``"error"`` key is returned instead of raising; for HTTP error
        statuses it additionally carries the response body under ``"detail"``.
    """
    endpoints = {
        "speak": ("POST", "/speak"),
        "stop": ("POST", "/stop"),
        "get_status": ("GET", "/status"),
    }
    if name not in endpoints:
        # Reject unknown tools before an HTTP client is opened.
        return json.dumps({"error": f"Unbekanntes Tool: {name}"})

    method, path = endpoints[name]
    # Only speak carries a request body; stop/get_status are sent without one.
    payload = args if name == "speak" else None

    try:
        with httpx.Client(timeout=15) as client:
            r = client.request(method, f"{TTS_URL}{path}", json=payload)
            r.raise_for_status()
            return r.text
    except httpx.ConnectError:
        return json.dumps({"error": f"TTS-Service nicht erreichbar: {TTS_URL}"})
    except httpx.HTTPStatusError as exc:
        # Keep the service's response body so the model can see why the
        # request was rejected (e.g. a validation message) and correct itself.
        return json.dumps({
            "error": f"HTTP {exc.response.status_code}",
            "detail": exc.response.text,
        })
    except Exception as exc:  # noqa: BLE001
        # Some exceptions stringify to "", fall back to the class name so the
        # error is never blank.
        return json.dumps({"error": str(exc) or type(exc).__name__})
|
||||
|
||||
|
||||
def _parse_tool_args(raw: str | None) -> dict:
    """Decode the JSON argument string of one tool call.

    Args:
        raw: The ``function.arguments`` string of a tool call.

    Returns:
        The decoded arguments; an empty dict when ``raw`` is ``None`` or blank.

    Raises:
        ValueError: If ``raw`` is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass) or does not decode to a JSON object.
    """
    if raw is None or not raw.strip():
        # A missing/blank argument string is treated as "no arguments" so that
        # parameterless tools (stop, get_status) still work.
        return {}
    parsed = json.loads(raw)
    if not isinstance(parsed, dict):
        raise ValueError(f"JSON-Objekt erwartet, erhalten: {type(parsed).__name__}")
    return parsed


def run_agent(model: str, base_url: str, system_prompt: str, voice: str | None) -> None:
    """Run the interactive tool-calling chat loop on the terminal.

    Reads user input from stdin, sends the conversation together with TOOLS to
    an OpenAI-compatible chat-completions endpoint, executes requested tool
    calls through ``_call_tts`` and prints the assistant's final answer.
    Returns when the user enters an exit word or sends Ctrl+C / EOF.

    Args:
        model: Model name passed to the chat-completions call.
        base_url: Base URL of the OpenAI-compatible API.
        system_prompt: Initial system message of the conversation.
        voice: Optional voice reference. When set, the model is told to pass
            it to speak(), and it is added to speak calls that lack it.

    Raises:
        SystemExit: With code 1 if the ``openai`` package is missing or the
            request to the TTS service's ``/health`` endpoint fails.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("Fehler: 'openai' nicht installiert. → pip install openai", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(
        base_url=base_url,
        api_key=os.environ.get("OPENAI_API_KEY", "ollama"),
    )

    if voice:
        system_prompt += f"\n\nWenn du speak() aufrufst, übergib immer voice='{voice}'."

    messages: list[dict] = [{"role": "system", "content": system_prompt}]

    # Check early that the TTS service can be reached. Only the request itself
    # is checked; the returned status code is not inspected.
    try:
        with httpx.Client(timeout=5) as _hc:
            _hc.get(f"{TTS_URL}/health")
    except Exception as _e:
        print(f"Fehler: TTS-Service nicht erreichbar ({TTS_URL}): {_e}", file=sys.stderr)
        sys.exit(1)

    print(f"Agent gestartet | Modell: {model} | TTS: {TTS_URL}")
    print("Beenden mit 'exit' oder Ctrl+C.\n")

    # Upper bound on LLM round trips per user turn; prevents an endless loop
    # with models that keep emitting tool calls without a final answer.
    max_iterations = 10

    while True:
        try:
            user_input = input("Du: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nTschüss!")
            break

        if user_input.lower() in ("exit", "quit", "bye", "tschüss"):
            print("Tschüss!")
            break
        if not user_input:
            continue

        messages.append({"role": "user", "content": user_input})

        for _iteration in range(max_iterations):
            try:
                resp = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    tools=TOOLS,
                    tool_choice="auto",
                )
            except Exception as exc:  # noqa: BLE001
                print(f"[LLM-Fehler] {exc}", file=sys.stderr)
                break

            msg = resp.choices[0].message
            # Build the history entry explicitly (instead of
            # model_dump(exclude_unset=True)) so that 'role' is always present,
            # which strict backends require.
            msg_dict: dict = {"role": msg.role, "content": msg.content}
            if msg.tool_calls:
                msg_dict["tool_calls"] = [tc.model_dump() for tc in msg.tool_calls]
            messages.append(msg_dict)

            if not msg.tool_calls:
                if msg.content:
                    print(f"\nAssistent: {msg.content}\n")
                break

            for tc in msg.tool_calls:
                try:
                    args = _parse_tool_args(tc.function.arguments)
                except ValueError as exc:
                    # Malformed arguments must not crash the session. Report
                    # them back as the tool result so every tool_call_id gets
                    # an answer and the model can retry.
                    print(f" [{tc.function.name}: ungültige Argumente: {exc}]", file=sys.stderr)
                    result = json.dumps({"error": f"Ungültige Tool-Argumente: {exc}"})
                else:
                    if voice and tc.function.name == "speak":
                        # The speak schema in TOOLS has no 'voice' property, so
                        # the model may omit it despite the prompt hint.
                        # NOTE(review): assumes /speak accepts a 'voice' field,
                        # as the prompt hint already implies; confirm.
                        args.setdefault("voice", voice)
                    arg_str = ", ".join(f"{k}={v!r}" for k, v in args.items())
                    print(f" [{tc.function.name}({arg_str})]")
                    result = _call_tts(tc.function.name, args)
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": result,
                })
        else:
            # Reached only when the loop ran out without a break, i.e. the
            # model never produced a final answer.
            print(f"[Agent] Maximale Tool-Call-Iterationen ({max_iterations}) erreicht.", file=sys.stderr)
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and start the interactive agent.

    Recognised options: --model, --base-url, --system-prompt, --voice and
    --lang. When --lang is given, an instruction to always use that language
    for speak() is appended to the system prompt before the agent starts.
    """
    parser = argparse.ArgumentParser(
        description="TTS Agent — LLM mit speak/stop/status Tools",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # (flag, default, help text), registered in this order.
    option_table = (
        (
            "--model",
            "qwen2.5",
            "Modellname (Standard: qwen2.5). Für OpenAI z. B. 'gpt-4o'.",
        ),
        (
            "--base-url",
            "http://localhost:11434/v1",
            "OpenAI-kompatibler API-Endpunkt (Standard: Ollama auf Port 11434).",
        ),
        (
            "--system-prompt",
            SYSTEM_PROMPT,
            "System-Prompt überschreiben.",
        ),
        (
            "--voice",
            None,
            "Pfad zu einer WAV-Referenzdatei für Voice Cloning (wird an speak() weitergegeben).",
        ),
        (
            "--lang",
            None,
            "Sprache für speak() überschreiben (z. B. 'en'). Standard: Modell entscheidet.",
        ),
    )
    for flag, default_value, help_text in option_table:
        parser.add_argument(flag, default=default_value, help=help_text)

    options = parser.parse_args()

    prompt = options.system_prompt
    if options.lang:
        prompt = f"{prompt}\n\nVerwende immer lang='{options.lang}' beim Aufruf von speak()."

    run_agent(
        model=options.model,
        base_url=options.base_url,
        system_prompt=prompt,
        voice=options.voice,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue