#!/usr/bin/env bash
#
# Starts a llama.cpp server container that serves a local GGUF model and
# waits until the model answers a chat-completion request.
#
# Environment:
#   HF_HOME  Host directory that contains MODEL_REL_PATH (optional; a
#            default path is used when unset or empty). It is mounted
#            read-only into the container at /hf_home.
#
# Requires: docker, curl.
# Exit status: 0 once a completion request returns HTTP 200; 1 if the
# model file is missing or the server never becomes ready.
set -euo pipefail

HF_HOME="${HF_HOME:-/home/dschlueter/nvme2n1p7_home/huggingface}"
MODEL_REL_PATH="models/qwen3/Qwen3.6-27B-Uncensored-HauhauCS-Aggressive-IQ4_XS.gguf"
IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"
CONTAINER_NAME="qwen36-27b-judge"
HOST_PORT=8002
CONTAINER_PORT=8000
MODEL_ALIAS="qwen3.5-judge"

echo "[*] Verwende HF_HOME = $HF_HOME"

# Fail early: the container would otherwise start and then die on a
# missing model path.
if [[ ! -f "$HF_HOME/$MODEL_REL_PATH" ]]; then
  echo "[!] Modell-Datei nicht gefunden: $HF_HOME/$MODEL_REL_PATH" >&2
  exit 1
fi

# Remove a previous container with the same name (running or stopped).
# The name list is captured first and matched as a fixed, whole-line
# string: piping `docker ps` into `grep -q` under `pipefail` can report
# failure when grep exits early, and a regex match would misread
# metacharacters in the name.
existing_containers=$(docker ps -a --format '{{.Names}}')
if grep -Fxq -- "$CONTAINER_NAME" <<<"$existing_containers"; then
  echo "[*] Stoppe existierenden Container $CONTAINER_NAME ..."
  # Best effort: a failure here surfaces as a name conflict in `docker run`.
  docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
fi

echo "[*] Starte llama.cpp-Server für Judge ..."

# Everything after "$IMAGE" is passed to the server inside the container.
# The model is addressed through the read-only /hf_home mount.
docker run -d \
  --gpus '"device=1,2"' \
  --name "$CONTAINER_NAME" \
  --restart unless-stopped \
  -e HF_HOME="/hf_home" \
  -v "$HF_HOME:/hf_home:ro" \
  -p "${HOST_PORT}:${CONTAINER_PORT}" \
  "$IMAGE" \
  -m "/hf_home/${MODEL_REL_PATH}" \
  --alias "${MODEL_ALIAS}" \
  -c 262144 \
  -n 16384 \
  --jinja \
  --chat-template-kwargs '{"enable_thinking":true}' \
  --no-context-shift \
  --temp 0.7 \
  --top-p 0.80 \
  --top-k 20 \
  --min-p 0.01 \
  --repeat-penalty 1.05 \
  --main-gpu 0 \
  --tensor-split 0.5,0.5 \
  -ngl 999 \
  -fa on \
  --kv-unified \
  --cache-type-k q4_0 \
  --cache-type-v q4_0 \
  --batch-size 512 \
  --ubatch-size 256 \
  --parallel 1 \
  --cont-batching \
  --host 0.0.0.0 \
  --port "$CONTAINER_PORT"

echo "[*] Warte auf Modell-Bereitschaft (Completion-Check, max. 180 s) ..."

# Minimal one-token chat request used as the readiness probe; it does not
# change between attempts, so it is built once.
PROBE_PAYLOAD="{\"model\":\"${MODEL_ALIAS}\",\"messages\":[{\"role\":\"user\",\"content\":\"ping\"}],\"max_tokens\":1,\"temperature\":0.0,\"stream\":false}"

# Up to 90 probes with a 2 s pause between them. Each probe itself may
# take up to 10 s (--max-time), so the total wait can exceed 180 s.
MODEL_READY=0
for i in {1..90}; do
  # curl exits non-zero while the server is not reachable yet. Under
  # `set -e` a failing command substitution in a plain assignment would
  # abort the script, so the failure is caught here and recorded as "000".
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 \
    -X POST "http://localhost:${HOST_PORT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "$PROBE_PAYLOAD") || HTTP_CODE="000"

  if [[ "$HTTP_CODE" == "200" ]]; then
    MODEL_READY=1
    break
  fi

  echo " [${i}/90] HTTP ${HTTP_CODE:-000} — Modell lädt noch, warte 2s ..."
  sleep 2
done

if (( MODEL_READY != 1 )); then
  echo "[!] Modell wurde nicht rechtzeitig bereit (kein HTTP 200 auf Completion)." >&2
  # Show recent container output to help diagnose the failed start.
  docker logs --tail 200 "$CONTAINER_NAME" || true
  exit 1
fi

echo "[*] Modell bereit — erster Completion-Request erfolgreich (HTTP 200)."
echo "[*] Server läuft auf http://0.0.0.0:${HOST_PORT}"
echo "[*] Stoppen mit: docker rm -f ${CONTAINER_NAME}"