# File viewer metadata: 91 lines, 2.5 KiB, YAML
# Docker Compose definition for a single llama.cpp server container
# (CUDA build) that serves a Qwen MoE GGUF model on port 8000 using two GPUs.
services:
  qwen35b:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    container_name: qwen35b-moe-coding
    restart: unless-stopped

    ports:
      - "8000:8000"

    environment:
      HF_HOME: /hf_home
      # Host GPU indices: the two 3090s are 1 and 2; the T600 is 0.
      NVIDIA_VISIBLE_DEVICES: "1,2"

    volumes:
      # Model directory is mounted read-only.
      - /home/dschlueter/nvme2n1p7_home/huggingface:/hf_home:ro

    command:
      - -m
      - /hf_home/models/qwen3/Carnice-Qwen3.6-MoE-35B-A3B-Q4_K_M.gguf

      # Context & output
      - -c
      # 256k: ideal for large code projects with many files in context
      - "262144"
      - -n
      # 16k: enough for complex classes, whole files, long explanations
      - "16384"

      # Sampler
      - --temp
      # Compromise: low enough for edit-tool precision, varied enough for
      # creative coding
      - "0.3"
      - --top-p
      - "0.95"  # Qwen recommendation
      - --top-k
      - "40"  # Qwen recommendation
      - --min-p
      - "0.01"  # stabilizes the sampling distribution
      - --repeat-penalty
      # Minimal: prevents text repetition loops, barely hurts the edit tool
      - "1.05"

      # GPU / multi-GPU setup
      - --main-gpu
      - "0"  # first 3090 as the main GPU inside the container
      - --tensor-split
      - "0.5,0.5"  # symmetric: both 3090s have 24 GB VRAM each
      - -ngl
      - "999"  # offload all layers to the GPU
      - -fa
      # Quoted so YAML 1.1 parsers do not read it as boolean true.
      - "on"  # Flash Attention: optimized memory access and matmul

      # KV cache
      - --kv-unified
      - --cache-type-k
      - q8_0  # good speed/quality trade-off
      - --cache-type-v
      - q8_0

      # Batching & parallelism
      - --batch-size
      # Large prompt batch: faster processing of long file contexts
      - "2048"
      - --ubatch-size
      - "512"  # matched to batch-size
      - --parallel
      # 2 parallel slots for single-user use: saves ~10 GB of KV cache vs. 4
      - "2"
      - --cont-batching  # enable continuous batching

      # Server
      - --jinja
      - --no-context-shift
      - --host
      - "0.0.0.0"
      - --port
      - "8000"

    healthcheck:
      # Probe llama-server's dedicated /health endpoint instead of the web UI
      # root. curl -f exits non-zero on HTTP errors, so the container only
      # counts as healthy once /health answers successfully.
      test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Grace period before failed probes count against the retry limit.
      start_period: 120s

    deploy:
      resources:
        reservations:
          devices:
            # Same host GPU indices as NVIDIA_VISIBLE_DEVICES above.
            - driver: nvidia
              device_ids: ["1", "2"]
              capabilities: [gpu]