# llama-server/run_qwen35b_cli_uncensored_rag_longctx.sh
#
# Repository listing metadata at capture time:
#   25 lines, 605 B, Bash, executable file

# Start llama.cpp's CUDA server image in Docker and serve the Qwen GGUF model
# over HTTP on port 8000.
#
# Required environment:
#   HF_HOME  Host directory; "$HF_HOME/models/qwen3" is bind-mounted at /models
#            in the container and must hold the .gguf file passed to -m.
#
# Docker options:
#   --rm -it                  Interactive TTY; container is removed on exit.
#   --gpus '"device=1,2"'     Expose host GPUs 1 and 2. The inner double quotes
#                             keep the comma-separated list as a single value.
#   -p 8000:8000              Publish the server port on the host.
#
# llama-server options (everything after the image name):
#   -m / -c / -n              Model file, context size (tokens), and the
#                             default maximum number of tokens to generate.
#   --jinja                   Use the Jinja chat template.
#   --no-context-shift        Do not shift the context when it fills up.
#   --temp .. --repeat-penalty  Default sampling parameters.
#   --main-gpu / --tensor-split / -ngl
#                             Offload (up to) 999 layers to GPU and split the
#                             tensors 50/50 across the two visible GPUs.
#                             NOTE(review): GPU index 0 here is relative to the
#                             devices exposed to the container, presumably host
#                             GPU 1 -- confirm.
#   -fa on                    Enable flash attention.
#   --kv-unified, --cache-type-k/-v q8_0
#                             Unified KV buffer with q8_0-quantized K and V.
#                             NOTE(review): presumably chosen so both parallel
#                             slots share the 262144-token context -- confirm.
#   --batch-size / --ubatch-size
#                             Logical and physical batch sizes.
#   --parallel 2 --cont-batching
#                             Two server slots with continuous batching.
#   --host 0.0.0.0 --port 8000
#                             Listen on all container interfaces on the port
#                             published by -p above.
#
# NOTE(review): no --api-key is passed and -p publishes on all host
# interfaces, so the endpoint is reachable unauthenticated from the network.
# Confirm this is intended, or bind with -p 127.0.0.1:8000:8000.

# Fail fast when HF_HOME is unset or empty. Otherwise the volume spec below
# would expand to "/models/qwen3:/models", making Docker bind-mount (and, if it
# does not exist, create) an unrelated root-level directory on the host.
: "${HF_HOME:?HF_HOME must be set (expects the model under \$HF_HOME/models/qwen3)}"

docker run --rm -it \
  --gpus '"device=1,2"' \
  -p 8000:8000 \
  -v "$HF_HOME/models/qwen3:/models" \
  ghcr.io/ggml-org/llama.cpp:server-cuda \
  -m /models/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf \
  -c 262144 \
  -n 16384 \
  --jinja \
  --no-context-shift \
  --temp 0.2 --top-p 0.95 --top-k 40 --min-p 0.01 --repeat-penalty 1.05 \
  --main-gpu 0 \
  --tensor-split 0.5,0.5 \
  -ngl 999 \
  -fa on \
  --kv-unified \
  --cache-type-k q8_0 \
  --cache-type-v q8_0 \
  --batch-size 2048 \
  --ubatch-size 512 \
  --parallel 2 \
  --cont-batching \
  --host 0.0.0.0 \
  --port 8000