💜Qwen3.5 - How to Run Locally Guide
Run the new Qwen3.5 LLMs including Medium: Qwen3.5-35B-A3B, 27B, 122B-A10B, Small: Qwen3.5-0.8B, 2B, 4B, 9B and 397B-A17B on your local device!
⚙️ Usage Guide
Qwen3.5
3-bit
4-bit
6-bit
8-bit
BF16
Recommended Settings
Thinking mode:
General tasks
Precise coding tasks (e.g. WebDev)
temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
temperature=0.6, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=0.0, repetition_penalty=1.0
Instruct (non-thinking) mode settings:
General tasks
Reasoning tasks
temperature=0.7, top_p=0.8, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
temperature=1.0, top_p=0.95, top_k=20, min_p=0.0, presence_penalty=1.5, repetition_penalty=1.0
Qwen3.5 Inference Tutorials:
Qwen3.5-35B-A3B
🦙 Llama.cpp Guides
1
apt-get update
apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
git clone https://github.com/ggml-org/llama.cpp
cmake llama.cpp -B llama.cpp/build \
-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON
cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-mtmd-cli llama-server llama-gguf-split
cp llama.cpp/build/bin/llama-* llama.cpp
2
export LLAMA_CACHE="unsloth/Qwen3.5-35B-A3B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-35B-A3B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-35B-A3B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
export LLAMA_CACHE="unsloth/Qwen3.5-35B-A3B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-35B-A3B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
3
hf download unsloth/Qwen3.5-35B-A3B-GGUF \
--local-dir unsloth/Qwen3.5-35B-A3B-GGUF \
--include "*UD-Q4_K_XL*" # Use "*UD-Q2_K_XL*" for Dynamic 2bit4
./llama.cpp/llama-cli \
--model unsloth/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf \
--mmproj unsloth/Qwen3.5-35B-A3B-GGUF/mmproj-F16.gguf \
--seed 3407 \
--temp 1.0 \
--top-p 0.95 \
--min-p 0.01 \
--top-k 40
Qwen3.5 Small (0.8B • 2B • 4B • 9B)
1
apt-get update
apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
git clone https://github.com/ggml-org/llama.cpp
cmake llama.cpp -B llama.cpp/build \
-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON
cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-mtmd-cli llama-server llama-gguf-split
cp llama.cpp/build/bin/llama-* llama.cpp
2
export LLAMA_CACHE="unsloth/Qwen3.5-9B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--alias "unsloth/Qwen3.5-9B-GGUF" \
--port 8001 \
--chat-template-kwargs '{"enable_thinking":true}'
export LLAMA_CACHE="unsloth/Qwen3.5-9B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--alias "unsloth/Qwen3.5-9B-GGUF" \
--port 8001 \
--chat-template-kwargs '{"enable_thinking":true}'
export LLAMA_CACHE="unsloth/Qwen3.5-9B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-9B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-9B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
3
hf download unsloth/Qwen3.5-9B-GGUF \
--local-dir unsloth/Qwen3.5-9B-GGUF \
--include "*UD-Q4_K_XL*" # Use "*UD-Q2_K_XL*" for Dynamic 2bit4
./llama.cpp/llama-cli \
--model unsloth/Qwen3.5-9B-GGUF/Qwen3.5-9B-UD-Q4_K_XL.gguf \
--mmproj unsloth/Qwen3.5-9B-GGUF/mmproj-F16.gguf \
--seed 3407 \
--temp 1.0 \
--top-p 0.95 \
--min-p 0.01 \
--top-k 40
Qwen3.5-27B
1
apt-get update
apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
git clone https://github.com/ggml-org/llama.cpp
cmake llama.cpp -B llama.cpp/build \
-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON
cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-mtmd-cli llama-server llama-gguf-split
cp llama.cpp/build/bin/llama-* llama.cpp
2
export LLAMA_CACHE="unsloth/Qwen3.5-27B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-27B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-27B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-27B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-27B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-27B-GGUF:Q4_K_M \
--ctx-size 16384 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
export LLAMA_CACHE="unsloth/Qwen3.5-27B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-27B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
3
hf download unsloth/Qwen3.5-27B-GGUF \
--local-dir unsloth/Qwen3.5-27B-GGUF \
--include "*UD-Q4_K_XL*" # Use "*UD-Q2_K_XL*" for Dynamic 2bit4
./llama.cpp/llama-cli \
--model unsloth/Qwen3.5-27B-GGUF/Qwen3.5-27B-UD-Q4_K_XL.gguf \
--mmproj unsloth/Qwen3.5-27B-GGUF/mmproj-F16.gguf \
--seed 3407 \
--temp 1.0 \
--top-p 0.95 \
--min-p 0.01 \
--top-k 40
Qwen3.5-122B-A10B
1
apt-get update
apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
git clone https://github.com/ggml-org/llama.cpp
cmake llama.cpp -B llama.cpp/build \
-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON
cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-mtmd-cli llama-server llama-gguf-split
cp llama.cpp/build/bin/llama-* llama.cpp
2
export LLAMA_CACHE="unsloth/Qwen3.5-122B-A10B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-122B-A10B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-122B-A10B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
export LLAMA_CACHE="unsloth/Qwen3.5-122B-A10B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-122B-A10B-GGUF:UD-Q4_K_XL \
--ctx-size 16384 \
--temp 1.0 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
3
hf download unsloth/Qwen3.5-122B-A10B-GGUF \
--local-dir unsloth/Qwen3.5-122B-A10B-GGUF \
--include "*UD-Q4_K_XL*" # Use "*UD-Q2_K_XL*" for Dynamic 2bit4
./llama.cpp/llama-cli \
--model unsloth/Qwen3.5-122B-A10B-GGUF/UD-Q4_K_XL/Qwen3.5-122B-A10B-UD-Q4_K_XL-00001-of-00003.gguf \
--mmproj unsloth/Qwen3.5-122B-A10B-GGUF/mmproj-F16.gguf \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--seed 3407
Qwen3.5-397B-A17B
1
apt-get update
apt-get install pciutils build-essential cmake curl libcurl4-openssl-dev -y
git clone https://github.com/ggml-org/llama.cpp
cmake llama.cpp -B llama.cpp/build \
-DBUILD_SHARED_LIBS=OFF -DGGML_CUDA=ON
cmake --build llama.cpp/build --config Release -j --clean-first --target llama-cli llama-mtmd-cli llama-server llama-gguf-split
cp llama.cpp/build/bin/llama-* llama.cpp
2
export LLAMA_CACHE="unsloth/Qwen3.5-397B-A17B-GGUF"
./llama.cpp/llama-cli \
-hf unsloth/Qwen3.5-397B-A17B-GGUF:Q4_K_M \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00
export LLAMA_CACHE="unsloth/Qwen3.5-397B-A17B-GGUF"
./llama.cpp/llama-server \
-hf unsloth/Qwen3.5-397B-A17B-GGUF:Q4_K_M \
--ctx-size 16384 \
--temp 0.7 \
--top-p 0.8 \
--top-k 20 \
--min-p 0.00 \
--chat-template-kwargs '{"enable_thinking":false}'
3
hf download unsloth/Qwen3.5-397B-A17B-GGUF \
--local-dir unsloth/Qwen3.5-397B-A17B-GGUF \
--include "*Q4_K_M*" # Use "*UD-Q2_K_XL*" for Dynamic 2bit4
./llama.cpp/llama-cli \
--model unsloth/Qwen3.5-397B-A17B-GGUF/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf \
--mmproj unsloth/Qwen3.5-397B-A17B-GGUF/mmproj-F16.gguf \
--ctx-size 16384 \
--temp 0.6 \
--top-p 0.95 \
--top-k 20 \
--min-p 0.00 \
--seed 3407
👾 LM Studio Guide
1
2
lms get unsloth/qwen3.5-4b
3
🦙 Llama-server serving & OpenAI's completion library
./llama.cpp/llama-server \
--model unsloth/Qwen3.5-35B-A3B-GGUF/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf \
--mmproj unsloth/Qwen3.5-35B-A3B-GGUF/mmproj-F16.gguf \
--alias "unsloth/Qwen3.5-35B-A3B" \
--temp 0.6 \
--top-p 0.95 \
--ctx-size 16384 \
--top-k 20 \
--min-p 0.00 \
--port 8001
from openai import OpenAI
import json
openai_client = OpenAI(
base_url = "http://127.0.0.1:8001/v1",
api_key = "sk-no-key-required",
)
completion = openai_client.chat.completions.create(
model = "unsloth/Qwen3.5-397B-A17B",
messages = [{"role": "user", "content": "Create a Snake game."},],
)
print(completion.choices[0].message.content)
🤔 How to enable or disable reasoning & thinking
./llama.cpp/llama-server \
--model unsloth/Qwen3.5-9B-GGUF/Qwen3.5-9B-BF16.gguf \
--alias "unsloth/Qwen3.5-9B-GGUF" \
--temp 0.6 \
--top-p 0.95 \
--ctx-size 16384 \
--top-k 20 \
--min-p 0.00 \
--port 8001 \
--chat-template-kwargs '{"enable_thinking":true}'
from openai import OpenAI
import json
openai_client = OpenAI(
base_url = "http://127.0.0.1:8001/v1",
api_key = "sk-no-key-required",
)
completion = openai_client.chat.completions.create(
model = "unsloth/Qwen3.5-9B-GGUF",
messages = [{"role": "user", "content": "What is 2+2?"},],
)
print(completion.choices[0].message.content)
print(completion.choices[0].message.reasoning_content)
👨‍💻 OpenAI Codex & Claude Code
🔨Tool Calling with Qwen3.5
📊 Benchmarks
Unsloth GGUF Benchmarks
Qwen3.5-397B-A17B Benchmarks
Official Qwen Benchmarks
Qwen3.5-35B-A3B, 27B and 122B-A10B Benchmarks
Qwen3.5-4B and 9B Benchmarks
Qwen3.5-397B-A17B Benchmarks
Last updated
Was this helpful?