#!/usr/bin/env bash
#
# Starts a llama.cpp server container for a local GGUF model, waits until the
# HTTP port and the model respond, and sends one test chat request.
#
# Environment:
#   HF_HOME  Host directory that contains the model file (see MODEL_REL_PATH).
#            Falls back to the default below when unset or empty.
set -euo pipefail

# Configuration
HF_HOME="${HF_HOME:-/home/dschlueter/nvme2n1p7_home/huggingface}"
# Model file path, relative to HF_HOME on the host.
MODEL_REL_PATH="models/qwen3/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"
CONTAINER_NAME="qwen35b-moe-uncensored-rag-longctx"
# Port published on the host and port the server listens on in the container.
HOST_PORT=8000
CONTAINER_PORT=8000

echo "[*] Verwende HF_HOME = $HF_HOME"

# Pre-flight check: abort before touching Docker when the model file is not a
# regular file on the host.
if [[ ! -f "$HF_HOME/$MODEL_REL_PATH" ]]; then
  echo "[!] Modell-Datei nicht gefunden: $HF_HOME/$MODEL_REL_PATH" >&2
  exit 1
fi

# Optional: remove an existing container of the same name (running or stopped)
# so that a later `docker run --name` does not hit a name conflict.

#######################################
# Reports whether a container with exactly the given name exists.
# Arguments: $1 - container name
# Returns:   0 if such a container is listed by `docker ps -a`,
#            1 otherwise (also for an empty name or when `docker ps` fails)
#######################################
container_exists() {
  local wanted="${1-}"
  local names

  # An empty name would match the empty line a here-string yields for an
  # empty container list.
  [[ -n "$wanted" ]] || return 1

  # Capture the list first instead of piping into `grep -q`: with `pipefail`,
  # grep exiting early can make the producer die of SIGPIPE and turn a match
  # into a failed pipeline.
  names="$(docker ps -a --format '{{.Names}}')" || return 1

  # -F -x: compare whole lines literally, so characters such as '.' in the
  # name are not treated as regex operators and no end anchor has to be
  # escaped through shell quoting.
  grep -Fxq -- "$wanted" <<<"$names"
}

if container_exists "${CONTAINER_NAME-}"; then
  echo "[*] Stoppe existierenden Container $CONTAINER_NAME ..."
  # Best effort: a failure here surfaces as a name conflict in `docker run`.
  docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
fi

echo "[*] Starte llama.cpp-Server-Container ($IMAGE) ..."
echo "[*] Modus: Uncensored, RAG-fähig, Long Context"

# Options for `docker run` itself (everything before the image name).
docker_args=(
  -d
  # The inner double quotes are part of the value: the device list contains
  # a comma and must reach Docker as one quoted item.
  --gpus '"device=1,2"'
  --name "$CONTAINER_NAME"
  --restart unless-stopped
  -e HF_HOME="/hf_home"
  # The host model directory is mounted read-only at /hf_home.
  -v "$HF_HOME:/hf_home:ro"
  -p "${HOST_PORT}:${CONTAINER_PORT}"
)

# Arguments passed to the image's entrypoint (the llama.cpp server).
server_args=(
  -m "/hf_home/${MODEL_REL_PATH}"
  -c 262144
  -n 16384
  --jinja
  --no-context-shift
  # Server-side sampling settings.
  --temp 0.2
  --top-p 0.95
  --top-k 40
  --min-p 0.01
  --repeat-penalty 1.05
  # NOTE(review): GPU index 0 presumably refers to the first device visible
  # inside the container, not to host GPU 0 - confirm.
  --main-gpu 0
  --tensor-split 0.5,0.5
  -ngl 999
  -fa on
  --kv-unified
  --cache-type-k q8_0
  --cache-type-v q8_0
  --batch-size 2048
  --ubatch-size 512
  --parallel 2
  --cont-batching
  --host 0.0.0.0
  --port "$CONTAINER_PORT"
)

docker run "${docker_args[@]}" "$IMAGE" "${server_args[@]}"

echo "[*] Container gestartet: $CONTAINER_NAME"
echo "[*] Warte, bis HTTP-Port ${HOST_PORT} antwortet ..."

# Poll the server root until curl gets any HTTP response: at most 90 attempts
# with a 2 s pause in between.
HTTP_READY=0
for i in {1..90}; do
  # The timeouts keep a single hung connection from blocking the loop
  # forever, which would defeat the attempt limit.
  if curl -s --connect-timeout 2 --max-time 10 \
    "http://localhost:${HOST_PORT}/" >/dev/null 2>&1; then
    echo "[*] Server antwortet auf http://localhost:${HOST_PORT}/"
    HTTP_READY=1
    break
  fi
  echo "[*] Warte (${i}/90) auf HTTP ..."
  sleep 2
done

if [[ "$HTTP_READY" -ne 1 ]]; then
  echo "[!] HTTP-Server wurde nicht rechtzeitig erreichbar." >&2
  echo "[*] Letzte Container-Logs:"
  # Best effort: the logs are diagnostics only, the script exits 1 regardless.
  docker logs --tail 200 "$CONTAINER_NAME" || true
  exit 1
fi

echo "[*] Warte, bis das Modell wirklich geladen ist ..."

# Minimal chat request used as the readiness probe.
ready_payload='{
  "model": "qwen3.6-35b-a3b-moe-rag-longctx",
  "messages": [
    { "role": "system", "content": "Du bist ein hilfreicher deutscher Assistent." },
    { "role": "user", "content": "Antworte nur mit dem Wort: bereit" }
  ],
  "max_tokens": 8,
  "temperature": 0.0,
  "stream": false
}'
nl=$'\n'

# Poll the chat endpoint until it answers with HTTP 200: at most 180 attempts
# with a 5 s pause in between.
MODEL_READY=0
for i in {1..180}; do
  # Body and status code are captured in one string (status on the last line)
  # instead of going through a fixed-name file in /tmp. This avoids a
  # predictable temp path and never logs a stale body from an earlier attempt.
  # `|| true`: a failed request is an expected state while the model loads;
  # curl still prints "000" as the status code in that case.
  raw="$(curl -s --connect-timeout 5 --max-time 120 \
    -w '\n%{http_code}' \
    -X POST "http://localhost:${HOST_PORT}/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d "$ready_payload" || true)"

  HTTP_CODE=${raw##*$nl}
  if [[ "$raw" == *"$nl"* ]]; then
    BODY=${raw%$nl*}
  else
    # No separator means there was no body (e.g. curl produced no output).
    BODY=""
  fi

  if [[ "$HTTP_CODE" == "200" ]]; then
    echo "[*] Modell ist geladen und antwortet."
    MODEL_READY=1
    break
  fi

  echo "[*] Warte (${i}/180) auf Modell ... HTTP ${HTTP_CODE} - ${BODY}"
  sleep 5
done

if [[ "$MODEL_READY" -ne 1 ]]; then
  echo "[!] Modell wurde nicht rechtzeitig bereit." >&2
  echo "[*] Letzte Container-Logs:"
  # Best effort: the logs are diagnostics only, the script exits 1 regardless.
  docker logs --tail 200 "$CONTAINER_NAME" || true
  exit 1
fi

echo "[*] Sende finalen Test-Chat-Request an /v1/chat/completions ..."

final_payload='{
  "model": "qwen3.6-35b-a3b-moe-rag-longctx",
  "messages": [
    { "role": "system", "content": "Du bist ein hilfreicher deutscher Assistent für RAG-gestützte Wissensarbeit." },
    { "role": "user", "content": "Antworte in einem Satz: Der Server für sehr langen Kontext ist betriebsbereit." }
  ],
  "max_tokens": 64,
  "temperature": 0.2,
  "stream": false
}'

# Check the request explicitly: under `set -e` a failing `curl -s` inside a
# plain assignment would end the script without any message.
if ! RESPONSE="$(curl -sS --connect-timeout 5 --max-time 300 \
  -X POST "http://localhost:${HOST_PORT}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "$final_payload")"; then
  echo "[!] Finaler Test-Request fehlgeschlagen." >&2
  echo "[*] Letzte Container-Logs:"
  # Best effort: the logs are diagnostics only, the script exits 1 regardless.
  docker logs --tail 200 "$CONTAINER_NAME" || true
  exit 1
fi

echo
echo "[*] Antwort vom Server:"
# printf instead of echo: the response is arbitrary data and must not be
# interpreted as echo options.
printf '%s\n' "$RESPONSE"

echo
echo "[*] Zum Stoppen des Servers:"
echo " docker rm -f $CONTAINER_NAME"