Initial commit: Qwen3.6-MoE-35B-A3B server configuration and documentation
This commit is contained in:
commit
b039061615
16 changed files with 1672 additions and 0 deletions
25
run_qwen35b_cli_tools_rag_longctx.sh
Executable file
25
run_qwen35b_cli_tools_rag_longctx.sh
Executable file
|
|
@ -0,0 +1,25 @@
|
|||
|
||||
# Launch the llama.cpp CUDA server image with the Carnice-Qwen3.6-MoE-35B-A3B
# Q4_K_M GGUF model, configured for long-context CLI / tool-calling / RAG use.
#
# Requirements:
#   HF_HOME  must be set; "$HF_HOME/models/qwen3" is bind-mounted at /models
#            and must contain the .gguf file named by -m below. The expansion
#            uses ${HF_HOME:?} so that an unset/empty variable aborts here
#            instead of silently mounting "/models/qwen3" from the host root.
#
# Container options:
#   --rm -it                 remove the container on exit; attach a TTY
#   --gpus '"device=1,2"'    expose host GPUs 1 and 2 to the container
#   -p 8000:8000             publish the server port on the host
#
# Server options (see the llama.cpp server README for authoritative meanings):
#   -c 262144 / -n 16384     context size / max tokens to predict
#   --jinja                  use the model's Jinja chat template
#   --no-context-shift       do not shift context when it fills up
#   --temp/--top-p/--top-k/--min-p/--repeat-penalty
#                            default sampling parameters
#   --main-gpu 0, --tensor-split 0.5,0.5, -ngl 999
#                            offload all layers, split evenly over two GPUs.
#                            NOTE(review): GPU index 0 here is presumably
#                            relative to the devices exposed by --gpus (i.e.
#                            host GPU 1) - confirm on the target machine.
#   -fa on                   enable flash attention
#   --kv-unified, --cache-type-k/-v q8_0
#                            unified KV buffer with q8_0-quantized K and V
#   --batch-size 2048, --ubatch-size 512
#                            logical / physical batch sizes
#   --parallel 2, --cont-batching
#                            two server slots with continuous batching
#   --host 0.0.0.0 --port 8000
#                            listen on all interfaces inside the container so
#                            the published port is reachable
docker run --rm -it \
  --gpus '"device=1,2"' \
  -p 8000:8000 \
  -v "${HF_HOME:?HF_HOME must be set to the directory containing models/qwen3}/models/qwen3:/models" \
  ghcr.io/ggml-org/llama.cpp:server-cuda \
  -m /models/Carnice-Qwen3.6-MoE-35B-A3B-Q4_K_M.gguf \
  -c 262144 \
  -n 16384 \
  --jinja \
  --no-context-shift \
  --temp 0.2 --top-p 0.95 --top-k 40 --min-p 0.01 --repeat-penalty 1.05 \
  --main-gpu 0 \
  --tensor-split 0.5,0.5 \
  -ngl 999 \
  -fa on \
  --kv-unified \
  --cache-type-k q8_0 \
  --cache-type-v q8_0 \
  --batch-size 2048 \
  --ubatch-size 512 \
  --parallel 2 \
  --cont-batching \
  --host 0.0.0.0 \
  --port 8000
|
||||
Loading…
Add table
Add a link
Reference in a new issue