237 lines
8.1 KiB
Python
237 lines
8.1 KiB
Python
|
|
#!/usr/bin/env python3
"""
TTS Agent — conversational agent with speak/stop/status tools.

Works with any OpenAI-compatible LLM backend:
- Ollama: python tts_agent.py --model qwen2.5
- LM Studio: python tts_agent.py --base-url http://localhost:1234/v1 --model local-model
- OpenAI: OPENAI_API_KEY=sk-... python tts_agent.py --model gpt-4o

The agent automatically calls speak() when it should read text aloud.
Set TTS_URL to override the default TTS service address.

Pi (Inflection AI) is not supported — it has no public API or function calling.
"""
|
|||
|
|
from __future__ import annotations
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
|
|||
|
|
import httpx
|
|||
|
|
|
|||
|
|
# Base URL of the TTS service. Read from the TTS_URL environment variable when
# set; trailing slashes are stripped so endpoint paths can be appended as
# f"{TTS_URL}/speak" without producing a double slash.
TTS_URL = os.getenv("TTS_URL", "http://127.0.0.1:9999").rstrip("/")
|
|||
|
|
|
|||
|
|
# Default system prompt sent to the LLM as the first message of every session.
# It describes the three TTS tools and asks the model to answer in German.
# Kept as explicit lines so the blank separator line is visible in the source.
SYSTEM_PROMPT = "\n".join(
    (
        "Du bist ein hilfreicher Sprachassistent.",
        "Du hast Zugriff auf einen Text-to-Speech-Service mit folgenden Tools:",
        "- speak(text, lang, speed, interrupt): Text vorlesen lassen",
        "- stop(): laufende Ausgabe stoppen",
        "- get_status(): Ausgabe-Status abfragen",
        "",
        "Wenn der Nutzer darum bittet, etwas vorzulesen, oder wenn du eine längere Antwort",
        "gibst, die zum Vorlesen geeignet ist, ruf speak() mit dem entsprechenden Text auf.",
        "Antworte auf Deutsch, sofern nicht anders gewünscht.",
    )
)
|
|||
|
|
|
|||
|
|
def _function_tool(
    name: str,
    description: str,
    properties: dict | None = None,
    required: list[str] | None = None,
) -> dict:
    """Build one function-tool entry in the OpenAI ``tools`` format.

    ``properties`` defaults to an empty mapping (a tool without arguments);
    the ``required`` key is only emitted when a non-empty list is given.
    """
    parameters: dict = {"type": "object", "properties": properties or {}}
    if required:
        parameters["required"] = required
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": parameters,
        },
    }


# Tool schemas advertised to the LLM. The names must match the ones that
# _call_tts() dispatches on: "speak", "stop" and "get_status".
TOOLS = [
    _function_tool(
        "speak",
        "Text als Sprache ausgeben. Reiht den Text in die Warteschlange ein "
        "und gibt sofort zurück. Ideal für längere Texte oder wenn der Nutzer "
        "etwas vorgelesen haben möchte.",
        properties={
            "text": {
                "type": "string",
                "description": "Auszugebender Text (max. 4000 Zeichen).",
            },
            "lang": {
                "type": "string",
                "default": "de",
                "description": "Sprachcode: 'de', 'en', 'fr', 'es', … Standard: 'de'.",
            },
            "speed": {
                "type": "number",
                "default": 1.0,
                "description": "Geschwindigkeit 0.5–2.0. Standard: 1.0.",
            },
            "interrupt": {
                "type": "boolean",
                "default": False,
                "description": "True = laufende Ausgabe sofort unterbrechen.",
            },
        },
        required=["text"],
    ),
    _function_tool(
        "stop",
        "Laufende Sprachausgabe sofort stoppen und Warteschlange leeren.",
    ),
    _function_tool(
        "get_status",
        "Aktuellen Ausgabe-Status abfragen (laufender Job, Queue-Länge).",
    ),
]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _call_tts(name: str, args: dict) -> str:
    """Forward one tool call to the TTS service and return the reply as text.

    Args:
        name: Tool name requested by the model: "speak", "stop" or
            "get_status".
        args: Decoded tool arguments. Only "speak" sends them (as the JSON
            request body); the other tools take no arguments.

    Returns:
        The raw response body on success. On any failure a JSON object with
        an "error" key, so the caller can hand the string back to the model
        as the tool result either way. This function never raises.
    """
    # Reject unknown tools before opening a connection.
    if name not in ("speak", "stop", "get_status"):
        return json.dumps({"error": f"Unbekanntes Tool: {name}"})

    try:
        with httpx.Client(timeout=15) as client:
            if name == "speak":
                r = client.post(f"{TTS_URL}/speak", json=args)
            elif name == "stop":
                r = client.post(f"{TTS_URL}/stop")
            else:  # "get_status"
                r = client.get(f"{TTS_URL}/status")
            r.raise_for_status()
            return r.text
    except httpx.ConnectError:
        return json.dumps({"error": f"TTS-Service nicht erreichbar: {TTS_URL}"})
    except httpx.HTTPStatusError as exc:
        # Keep the service's own error body (truncated): it usually explains
        # *why* the request was rejected, which lets the model correct itself.
        return json.dumps({"error": str(exc), "detail": exc.response.text[:500]})
    except Exception as exc:  # noqa: BLE001
        # Some exceptions carry no message; fall back to the class name so the
        # model never receives an empty error.
        return json.dumps({"error": str(exc) or type(exc).__name__})
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _parse_tool_args(raw: str | None) -> dict:
    """Decode the JSON argument string of a tool call into a dict.

    A missing, empty or ``null`` argument string is treated as "no arguments"
    and yields an empty dict. Raises ValueError when the string is not valid
    JSON (json.JSONDecodeError is a ValueError) or does not decode to an object.
    """
    if raw is None or not raw.strip():
        return {}
    parsed = json.loads(raw)
    if parsed is None:
        return {}
    if not isinstance(parsed, dict):
        raise ValueError(f"JSON-Objekt erwartet, erhalten: {type(parsed).__name__}")
    return parsed


def run_agent(model: str, base_url: str, system_prompt: str, voice: str | None) -> None:
    """Run the interactive chat loop until the user quits.

    Args:
        model: Model name passed to the chat-completions endpoint.
        base_url: Base URL of the OpenAI-compatible API.
        system_prompt: Text of the initial system message.
        voice: Optional voice reference. When set, the model is instructed to
            pass it to speak(), and it is also added to every speak() call
            that lacks a "voice" argument.

    Exits the process with status 1 when the ``openai`` package is missing or
    the TTS service does not answer the initial health request.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("Fehler: 'openai' nicht installiert. → pip install openai", file=sys.stderr)
        sys.exit(1)

    client = OpenAI(
        base_url=base_url,
        api_key=os.environ.get("OPENAI_API_KEY", "ollama"),
    )

    if voice:
        system_prompt += f"\n\nWenn du speak() aufrufst, übergib immer voice='{voice}'."

    messages: list[dict] = [{"role": "system", "content": system_prompt}]

    # IMPROVE-7: check early that the TTS service is reachable.
    try:
        with httpx.Client(timeout=5) as _hc:
            _hc.get(f"{TTS_URL}/health")
    except Exception as _e:  # noqa: BLE001
        print(f"Fehler: TTS-Service nicht erreichbar ({TTS_URL}): {_e}", file=sys.stderr)
        sys.exit(1)

    print(f"Agent gestartet | Modell: {model} | TTS: {TTS_URL}")
    print("Beenden mit 'exit' oder Ctrl+C.\n")

    while True:
        try:
            user_input = input("Du: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nTschüss!")
            break

        if user_input.lower() in ("exit", "quit", "bye", "tschüss"):
            print("Tschüss!")
            break
        if not user_input:
            continue

        messages.append({"role": "user", "content": user_input})

        # BUG-3: cap the number of round trips so a model that keeps emitting
        # tool calls without ever giving a final answer cannot loop forever.
        max_iterations = 10
        for _iteration in range(max_iterations):
            try:
                resp = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    tools=TOOLS,
                    tool_choice="auto",
                )
            except Exception as exc:  # noqa: BLE001
                print(f"[LLM-Fehler] {exc}", file=sys.stderr)
                break

            msg = resp.choices[0].message
            # BUG-6: build the dict explicitly instead of using
            # model_dump(exclude_unset=True) so that 'role' is always present,
            # even with strict backends.
            msg_dict: dict = {"role": msg.role, "content": msg.content}
            if msg.tool_calls:
                msg_dict["tool_calls"] = [tc.model_dump() for tc in msg.tool_calls]
            messages.append(msg_dict)

            if not msg.tool_calls:
                if msg.content:
                    print(f"\nAssistent: {msg.content}\n")
                break

            for tc in msg.tool_calls:
                tool_name = tc.function.name
                try:
                    args = _parse_tool_args(tc.function.arguments)
                except ValueError as exc:
                    # Malformed arguments must not kill the session: report
                    # the problem as the tool result so the model can retry.
                    print(f" [{tool_name}(<ungültige Argumente>)]")
                    result = json.dumps({"error": f"Ungültige Tool-Argumente: {exc}"})
                else:
                    # The tool schema does not declare 'voice', so do not rely
                    # on the model passing it; a value it did pass wins.
                    if voice and tool_name == "speak":
                        args.setdefault("voice", voice)
                    arg_str = ", ".join(f"{k}={v!r}" for k, v in args.items())
                    print(f" [{tool_name}({arg_str})]")
                    result = _call_tts(tool_name, args)
                # Every tool call needs exactly one matching tool message.
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": result,
                })
        else:
            print(f"[Agent] Maximale Tool-Call-Iterationen ({max_iterations}) erreicht.", file=sys.stderr)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main() -> None:
    """Parse the command line and start the interactive agent.

    When --lang is given, an instruction to always use that language code in
    speak() calls is appended to the system prompt before it is handed to
    run_agent().
    """
    parser = argparse.ArgumentParser(
        description="TTS Agent — LLM mit speak/stop/status Tools",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # (flag, default, help) — registered in this order, which is also the
    # order shown by --help.
    option_table = (
        (
            "--model",
            "qwen2.5",
            "Modellname (Standard: qwen2.5). Für OpenAI z. B. 'gpt-4o'.",
        ),
        (
            "--base-url",
            "http://localhost:11434/v1",
            "OpenAI-kompatibler API-Endpunkt (Standard: Ollama auf Port 11434).",
        ),
        (
            "--system-prompt",
            SYSTEM_PROMPT,
            "System-Prompt überschreiben.",
        ),
        (
            "--voice",
            None,
            "Pfad zu einer WAV-Referenzdatei für Voice Cloning (wird an speak() weitergegeben).",
        ),
        (
            "--lang",
            None,
            "Sprache für speak() überschreiben (z. B. 'en'). Standard: Modell entscheidet.",
        ),
    )
    for flag, default_value, help_text in option_table:
        parser.add_argument(flag, default=default_value, help=help_text)

    opts = parser.parse_args()

    prompt = opts.system_prompt
    if opts.lang:
        prompt = f"{prompt}\n\nVerwende immer lang='{opts.lang}' beim Aufruf von speak()."

    run_agent(
        model=opts.model,
        base_url=opts.base_url,
        system_prompt=prompt,
        voice=opts.voice,
    )
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Start the agent only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|