Initial commit: Qwen3.6-MoE-35B-A3B server configuration and documentation
This commit is contained in:
commit
b039061615
16 changed files with 1672 additions and 0 deletions
91
docker-compose_Qwen3.6_Tools_coding.yml
Normal file
91
docker-compose_Qwen3.6_Tools_coding.yml
Normal file
|
|
@@ -0,0 +1,91 @@
|
|||
# Docker Compose stack running the llama.cpp CUDA server with the
# Qwen3.6 MoE 35B-A3B GGUF model, tuned for tool-assisted coding.
services:
  qwen35b:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    container_name: qwen35b-moe-coding
    restart: unless-stopped

    ports:
      - "8000:8000"

    environment:
      HF_HOME: /hf_home
      # Host numbering: the two RTX 3090s are devices 1 and 2; the T600 is 0.
      NVIDIA_VISIBLE_DEVICES: "1,2"

    volumes:
      # Model store is mounted read-only; the server only reads the GGUF.
      - /home/dschlueter/nvme2n1p7_home/huggingface:/hf_home:ro

    command:
      - -m
      - /hf_home/models/qwen3/Carnice-Qwen3.6-MoE-35B-A3B-Q4_K_M.gguf

      # --- Context & output ---
      # 256k context: suited to large code projects with many files loaded.
      - -c
      - "262144"
      # 16k max generated tokens: enough for complex classes, whole files
      # and long explanations.
      - -n
      - "16384"

      # --- Sampler ---
      # Compromise: low enough for edit-tool precision, variable enough
      # for creative coding.
      - --temp
      - "0.3"
      # Qwen recommendation.
      - --top-p
      - "0.95"
      # Qwen recommendation.
      - --top-k
      - "40"
      # Stabilizes the sampling distribution.
      - --min-p
      - "0.01"
      # Minimal penalty: prevents text repetition loops while barely
      # affecting the edit tool.
      - --repeat-penalty
      - "1.05"

      # --- GPU / multi-GPU setup ---
      # First 3090 is the main GPU (index 0 as seen inside the container).
      - --main-gpu
      - "0"
      # Symmetric split: both 3090s have 24 GB of VRAM each.
      - --tensor-split
      - "0.5,0.5"
      # Offload all layers to the GPUs.
      - -ngl
      - "999"
      # Flash Attention: optimized memory access and matmul.
      # The value stays quoted so YAML does not read it as a boolean.
      - -fa
      - "on"

      # --- KV cache ---
      - --kv-unified
      # q8_0 is a good speed/quality compromise.
      - --cache-type-k
      - q8_0
      - --cache-type-v
      - q8_0

      # --- Batching & parallelism ---
      # Large prompt batch: faster processing of long file contexts.
      - --batch-size
      - "2048"
      # Chosen to match batch-size.
      - --ubatch-size
      - "512"
      # 2 parallel slots for a single user; per the original author this
      # saves roughly 10 GB of KV cache compared with 4 slots.
      - --parallel
      - "2"
      # Enable continuous batching.
      - --cont-batching

      # --- Server ---
      - --jinja
      - --no-context-shift
      - --host
      - 0.0.0.0
      - --port
      - "8000"

    healthcheck:
      # Probe llama-server's dedicated /health endpoint rather than the
      # web UI root, so the check reflects server readiness.
      test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      # Grace period before failed probes count against the retry limit.
      start_period: 120s

    deploy:
      resources:
        reservations:
          devices:
            # Same host GPU IDs as NVIDIA_VISIBLE_DEVICES in this service.
            - driver: nvidia
              device_ids: ["1", "2"]
              capabilities: [gpu]
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue