Initial commit: Qwen3.6-MoE-35B-A3B server configuration and documentation
This commit is contained in:
commit
b039061615
16 changed files with 1672 additions and 0 deletions
81
run_bge_m3_embedding_server.sh
Executable file
81
run_bge_m3_embedding_server.sh
Executable file
|
|
@ -0,0 +1,81 @@
|
|||
#!/usr/bin/env bash
#
# Runs the llama.cpp server image in Docker as an embedding server for the
# bge-m3 GGUF model, waits for it to answer over HTTP, and sends one test
# request to /v1/embeddings.
#
# Environment:
#   HF_HOME  Host directory that contains the model file (optional; a default
#            is used when unset or empty).
set -euo pipefail

# Configuration
# Host directory holding the model; overridable from the environment.
HF_HOME="${HF_HOME:-/home/dschlueter/nvme2n1p7_home/huggingface}"
# Model location relative to HF_HOME (the same relative path is used inside
# the container mount).
MODEL_REL_PATH="models/embeddings/bge-m3-q8_0.gguf"
IMAGE="ghcr.io/ggml-org/llama.cpp:server-cuda"
CONTAINER_NAME="qwen-embeddings"
# Port published on the host and port the server listens on in the container.
HOST_PORT=8001
CONTAINER_PORT=8001
# None of these values is reassigned later in the script.
readonly HF_HOME MODEL_REL_PATH IMAGE CONTAINER_NAME HOST_PORT CONTAINER_PORT

# Fail early, before any container is touched, if the model file does not
# exist on the host.
echo "[*] Verwende HF_HOME = $HF_HOME"
model_path="$HF_HOME/$MODEL_REL_PATH"
if [[ ! -f "$model_path" ]]; then
  echo "[!] Embedding-Modell-Datei nicht gefunden: $model_path" >&2
  exit 1
fi

# Remove any existing container with the same name (running or stopped);
# otherwise `docker run --name` further down fails with a name conflict.
#
# `docker container inspect` exits non-zero when no such container exists.
# The previous check grepped `docker ps` output for "^name\$", i.e. a name
# followed by a literal dollar sign, which never matched, so stale containers
# were never cleaned up.
if docker container inspect "$CONTAINER_NAME" >/dev/null 2>&1; then
  echo "[*] Stoppe existierenden Container $CONTAINER_NAME ..."
  # Best effort: a failure here surfaces as a name conflict in `docker run`.
  docker rm -f "$CONTAINER_NAME" >/dev/null 2>&1 || true
fi

echo "[*] Starte llama.cpp-Embedding-Server-Container ($IMAGE) ..."

# Options consumed by `docker run` itself.
docker_opts=(
  -d                                   # detached
  --gpus '"device=0"'                  # inner double quotes are part of the value
  --name "$CONTAINER_NAME"
  -e "HF_HOME=/hf_home"
  -v "$HF_HOME:/hf_home:ro"            # host model directory, mounted read-only
  -p "${HOST_PORT}:${CONTAINER_PORT}"
)

# Arguments handed to the image after its name (llama.cpp server flags; see
# the llama.cpp server documentation for their exact meaning).
server_args=(
  --embedding
  -m "/hf_home/${MODEL_REL_PATH}"      # model path as seen inside the container
  -c 8192
  -ngl 999
  -fa on
  --batch-size 1024
  --ubatch-size 512
  --host 0.0.0.0                       # listen on all container interfaces
  --port "$CONTAINER_PORT"
)

docker run "${docker_opts[@]}" "$IMAGE" "${server_args[@]}"

echo "[*] Container gestartet: $CONTAINER_NAME"
echo "[*] Warte, bis HTTP-Port ${HOST_PORT} antwortet ..."

# Poll the server for up to 60 attempts, 2 s apart.
# The probe targets /health with -f so that an HTTP error status (which the
# llama.cpp server docs describe for the model-loading phase) counts as
# "not ready"; plain `curl -s` on / succeeded on any HTTP response at all.
# --max-time keeps a single hung request from stalling the loop.
READY=0
for i in {1..60}; do
  if curl -sf --max-time 2 "http://localhost:${HOST_PORT}/health" >/dev/null 2>&1; then
    echo "[*] Server antwortet auf http://localhost:${HOST_PORT}/"
    READY=1
    break
  fi
  echo "[*] Warte (${i}/60) ..."
  sleep 2
done

# On timeout, show the tail of the container log to aid diagnosis, then fail.
if [[ "$READY" -ne 1 ]]; then
  echo "[!] Embedding-Server wurde nicht rechtzeitig erreichbar." >&2
  echo "[*] Letzte Container-Logs:"
  docker logs --tail 200 "$CONTAINER_NAME" || true
  exit 1
fi

# Short grace period after the readiness probe before sending real traffic.
sleep 3

echo "[*] Sende Test-Embedding-Request an /v1/embeddings ..."

# -s hides the progress meter; -S still prints curl's error message, so a
# failed request is not silent when `set -e` aborts the script.
RESPONSE="$(curl -sS -X POST "http://localhost:${HOST_PORT}/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "bge-m3-q8_0",
    "input": "Dies ist ein kurzer Testtext für den Embedding-Server."
  }')"

echo
echo "[*] Antwort vom Server:"
# printf instead of echo: the response is arbitrary data.
printf '%s\n' "$RESPONSE"

echo
echo "[*] Zum Stoppen des Servers:"
echo " docker rm -f $CONTAINER_NAME"
Loading…
Add table
Add a link
Reference in a new issue