Initial commit: Qwen3.6-MoE-35B-A3B server configuration and documentation

2026-05-11 15:01:09 +02:00 · 2026-05-11 15:01:09 +02:00 · b039061615
commit b039061615
16 changed files with 1672 additions and 0 deletions
--- a/run_qwen35b_cli_tools_rag_longctx.sh
+++ b/run_qwen35b_cli_tools_rag_longctx.sh
@ -0,0 +1,25 @@
+
+docker run --rm -it \
+  --gpus '"device=1,2"' \
+  -p 8000:8000 \
+  -v "$HF_HOME/models/qwen3:/models" \
+  ghcr.io/ggml-org/llama.cpp:server-cuda \
+  -m /models/Carnice-Qwen3.6-MoE-35B-A3B-Q4_K_M.gguf \
+  -c 262144 \
+  -n 16384 \
+  --jinja \
+  --no-context-shift \
+  --temp 0.2 --top-p 0.95 --top-k 40 --min-p 0.01 --repeat-penalty 1.05 \
+  --main-gpu 0 \
+  --tensor-split 0.5,0.5 \
+  -ngl 999 \
+  -fa on \
+  --kv-unified \
+  --cache-type-k q8_0 \
+  --cache-type-v q8_0 \
+  --batch-size 2048 \
+  --ubatch-size 512 \
+  --parallel 2 \
+  --cont-batching \
+  --host 0.0.0.0 \
+  --port 8000