llama-server/run_qwen35b_server_tools.sh

89 lines
2.4 KiB
Bash
Executable file

#!/usr/bin/env bash
#
# Starts a llama.cpp server container for a local GGUF model, waits for the
# server to respond and sends one test chat request to it.
#
# Environment:
#   HF_HOME  Host directory that contains the model tree (optional; the
#            default below is used when it is unset or empty).
set -euo pipefail

# --- Configuration ----------------------------------------------------------

# Host-side model root; a non-empty value from the environment is kept.
: "${HF_HOME:=/home/dschlueter/nvme2n1p7_home/huggingface}"
# Model file, relative to HF_HOME.
MODEL_REL_PATH='models/qwen3/Carnice-Qwen3.6-MoE-35B-A3B-Q4_K_M.gguf'

# Container image and the name assigned to the started container.
IMAGE='ghcr.io/ggml-org/llama.cpp:server-cuda'
CONTAINER_NAME='qwen35b-moe-tools'

# Port published on the host and port used inside the container.
HOST_PORT=8000
CONTAINER_PORT=8000
# Report which model root is in use, then abort early if the model file is
# not a regular file at the expected location.
printf '%s\n' "[*] Verwende HF_HOME = $HF_HOME"
model_file="${HF_HOME}/${MODEL_REL_PATH}"
[[ -f "$model_file" ]] || {
  printf '%s\n' "[!] Modell-Datei nicht gefunden: $model_file" >&2
  exit 1
}
# Remove a leftover container with the same name so that starting a new one
# under this name does not fail on a name conflict.
#
# The existence check asks Docker for this exact name instead of piping
# `docker ps` into `grep -q`: with `set -o pipefail` an early-exiting grep can
# terminate the producer with SIGPIPE and make the pipeline report failure
# even though the name matched. It also avoids treating the name as a regex.
if docker container inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
  echo "[*] Stoppe existierenden Container $CONTAINER_NAME ..."
  # Best effort: a failed removal is reported but is not fatal at this point.
  docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 \
    || echo "[!] Konnte Container $CONTAINER_NAME nicht entfernen" >&2
fi
# Start the server container in detached mode. Arguments are collected in two
# arrays: options for `docker run` itself, and arguments passed after the
# image name.
echo "[*] Starte llama.cpp-Server-Container ($IMAGE) ..."

# Options consumed by `docker run`.
docker_opts=(
  --gpus '"device=1,2"'
  --name "$CONTAINER_NAME"
  --restart unless-stopped
  -e HF_HOME="/hf_home"
  # Host model tree is mounted read-only at /hf_home.
  -v "$HF_HOME:/hf_home:ro"
  -p "${HOST_PORT}:${CONTAINER_PORT}"
)

# Arguments placed after the image name.
server_opts=(
  # Model and context/generation limits.
  -m "/hf_home/${MODEL_REL_PATH}"
  -c 262144
  -n 16384
  # Sampling defaults.
  --temp 0.3 --top-p 0.95 --top-k 40 --min-p 0.01 --repeat-penalty 1.05
  # GPU placement.
  # NOTE(review): presumably GPU indices here are relative to the devices
  # exposed via --gpus above — confirm.
  --main-gpu 0
  --tensor-split 0.5,0.5
  -ngl 999
  -fa on
  # KV cache settings.
  --kv-unified
  --cache-type-k q8_0
  --cache-type-v q8_0
  # Batching and parallelism.
  --batch-size 2048
  --ubatch-size 512
  --parallel 2
  --cont-batching
  # Chat template and context handling.
  --jinja
  --no-context-shift
  # Listen address and port inside the container.
  --host 0.0.0.0
  --port "$CONTAINER_PORT"
)

docker run -d "${docker_opts[@]}" "$IMAGE" "${server_opts[@]}"
echo "[*] Container gestartet: $CONTAINER_NAME"
# Wait until the server reports that it is ready to serve requests.
#
# The probe targets /health with `curl -f`, so an HTTP error status (such as
# the 503 the server answers with while the model is still loading) counts as
# "not ready". A plain `curl -s` on / would succeed on any HTTP response.
# If the server never becomes ready the script stops with an error instead of
# silently continuing.
#
# Environment:
#   READY_MAX_ATTEMPTS  Number of probes, 2 s apart (optional, default 60).
echo "[*] Warte, bis HTTP-Port ${HOST_PORT} antwortet ..."
health_url="http://localhost:${HOST_PORT}/health"
max_attempts="${READY_MAX_ATTEMPTS:-60}"
server_ready=0
for ((attempt = 1; attempt <= max_attempts; attempt++)); do
  # --max-time keeps a hanging connection from blocking the loop.
  if curl -sf --max-time 5 "$health_url" >/dev/null 2>&1; then
    echo "[*] Server antwortet auf http://localhost:${HOST_PORT}/"
    server_ready=1
    break
  fi
  echo "[*] Warte (${attempt}/${max_attempts}) ..."
  sleep 2
done
if [[ "$server_ready" -ne 1 ]]; then
  echo "[!] Server wurde nach ${max_attempts} Versuchen nicht bereit." >&2
  echo "[!] Logs ansehen mit: docker logs $CONTAINER_NAME" >&2
  exit 1
fi
# Send one small chat completion request as a smoke test and print the raw
# response, followed by a hint on how to stop the server.
#
# A failing curl is captured instead of letting `set -e` abort the script
# silently: the failure is reported, the stop hint is still printed, and the
# script then exits with curl's status.
echo "[*] Sende Test-Chat-Request an /v1/chat/completions ..."
payload='{
"model": "qwen3.6-35b-a3b-moe",
"messages": [
{ "role": "system", "content": "Du bist ein hilfreicher deutscher Assistent." },
{ "role": "user", "content": "Gib eine sehr kurze Selbstdiagnose deiner Fähigkeiten." }
],
"max_tokens": 64,
"temperature": 0.3,
"stream": false
}'
request_status=0
# -S makes curl print transport errors even though -s hides the progress meter.
RESPONSE="$(curl -sS -X POST "http://localhost:${HOST_PORT}/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d "$payload")" || request_status=$?
echo
if [[ "$request_status" -eq 0 ]]; then
  echo "[*] Antwort vom Server:"
  printf '%s\n' "$RESPONSE"
else
  echo "[!] Test-Request fehlgeschlagen (curl-Exitcode ${request_status})" >&2
fi
echo
echo "[*] Zum Stoppen des Servers:"
echo " docker rm -f $CONTAINER_NAME"
if [[ "$request_status" -ne 0 ]]; then
  exit "$request_status"
fi