Add Qwopus3.6 coding docker-compose configuration
This commit is contained in:
parent
b039061615
commit
260cb22740
1 changed files with 95 additions and 0 deletions
95
docker-compose_Qwen3.6_Qwopus3.6_coding.yml
Normal file
95
docker-compose_Qwen3.6_Qwopus3.6_coding.yml
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
# Docker Compose stack: llama.cpp server (CUDA build) serving the
# Qwopus3.6-35B-A3B GGUF model for coding use, split across two GPUs.
services:
  qwen35b:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    # Dedicated container name: avoids a clash with qwen35b-moe-coding.
    container_name: qwopus35b-moe-coding
    restart: unless-stopped

    ports:
      - "8000:8000"

    environment:
      HF_HOME: /hf_home
      # Host GPU numbering: the two 3090s are 1 and 2; the T600 is 0.
      NVIDIA_VISIBLE_DEVICES: "1,2"

    volumes:
      # Model directory is mounted read-only.
      - /home/dschlueter/nvme2n1p7_home/huggingface:/hf_home:ro

    command:
      - -m
      - /hf_home/models/qwen3/Qwopus3.6-35B-A3B-v1-Q4_K_M.gguf
      # Multimodal projector file. The commented-out path below is the
      # model-specific file name kept as an alternative.
      - --mmproj
      # - /hf_home/models/qwen3/mmproj-Qwopus3.6-35B-A3B-v1-f16.gguf
      - /hf_home/models/qwen3/mmproj.gguf

      # Context & output
      - -c
      # 256k: ideal for large code projects with many files in context.
      - "262144"
      - -n
      # 16k: enough for complex classes, whole files, long explanations.
      - "16384"

      # Sampler
      - --temp
      # Compromise: low enough for edit-tool precision, varied enough for
      # creative coding.
      - "0.3"
      - --top-p
      - "0.95"  # Qwen recommendation
      - --top-k
      - "40"  # Qwen recommendation
      - --min-p
      - "0.01"  # stabilizes the sampling distribution
      - --repeat-penalty
      # Minimal: prevents text repetition loops, barely hurts the edit tool.
      - "1.05"

      # GPU / multi-GPU setup
      - --main-gpu
      - "0"  # first 3090 is the main GPU inside the container
      - --tensor-split
      - "0.5,0.5"  # symmetric: both 3090s have 24 GB VRAM each
      - -ngl
      - "999"  # offload all layers to the GPU
      - -fa
      - "on"  # Flash Attention: optimized memory access and matmul

      # KV cache
      - --kv-unified
      - --cache-type-k
      - q8_0  # good speed/quality trade-off
      - --cache-type-v
      - q8_0

      # Batching & parallelism
      - --batch-size
      # Large prompt batch: faster processing of long file contexts.
      - "2048"
      - --ubatch-size
      # SSM layers process micro-batches more efficiently than a pure
      # transformer.
      - "1024"
      - --parallel
      # Qwopus KV cache ~2.5 GB/slot (2 KV heads x 10 attention layers),
      # so 4 slots fit without problems.
      - "4"
      - --cont-batching  # enable continuous batching

      # Server
      - --jinja
      - --no-context-shift
      - --host
      - "0.0.0.0"
      - --port
      - "8000"

    healthcheck:
      # Probe the server's dedicated /health endpoint instead of the web UI
      # root, so the container is only reported healthy once the model is
      # loaded and the server is ready to answer requests.
      test: ["CMD-SHELL", "curl -fs http://localhost:8000/health || exit 1"]
      interval: 30s
      timeout: 5s
      retries: 3
      start_period: 120s

    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # Same host GPU IDs as NVIDIA_VISIBLE_DEVICES above.
              device_ids: ["1", "2"]
              capabilities: [gpu]
|
||||||
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue