Llama.cpp GLM 4.7 Flash Benchmark

I’ve been experimenting with an early branch of llama.cpp running GLM 4.7 Flash with the flash-attention (FA) fix from ggml-org/llama.cpp pull request #18953 (“CUDA: add gqa_ratio 4 for GLM 4.7 flash” by am17an), and I’ve noticed that it generates tokens faster. However, prompt processing still isn’t as fast as in vLLM, and I’m not sure whether I’m missing something there. It also seems that the main llama.cpp branch doesn’t include some of these optimizations yet, even though the fix has already been merged.

./build/bin/llama-server \
  --hf-repo unsloth/GLM-4.7-Flash-GGUF \
  --host 0.0.0.0 \
  --port 41447 \
  -c 524288 \
  -ngl 999 \
  -t 20 \
  -np 4 \
  --cont-batching \
  -ub 2048 \
  -fa 1 \
  --jinja \
  --reasoning-format auto \
  --no-mmap

uvx llama-benchy \
  --base-url http://localhost:41447/v1 \
  --model unsloth/glm-4.7-flash-GGUF \
  --pp 2048 \
  --tg 32 \
  --depth 0 4096 8192 16384 32768 65535 100000 \
  --runs 1 \
  --enable-prefix-caching \
  --latency-mode generation

model test t/s ttfr (ms) est_ppt (ms) e2e_ttft (ms)
unsloth/glm-4.7-flash-GGUF pp2048 2026.82 ± 0.00 983.58 ± 0.00 955.19 ± 0.00 983.62 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 52.46 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d4096 1768.13 ± 0.00 2009.01 ± 0.00 1980.62 ± 0.00 2009.06 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d4096 55.43 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d4096 1502.19 ± 0.00 1391.74 ± 0.00 1363.35 ± 0.00 1391.77 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d4096 47.40 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d8192 1354.72 ± 0.00 5271.56 ± 0.00 5243.17 ± 0.00 5271.62 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d8192 44.99 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d8192 987.55 ± 0.00 2102.21 ± 0.00 2073.82 ± 0.00 2102.25 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d8192 47.63 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d16384 925.85 ± 0.00 15896.01 ± 0.00 15867.62 ± 0.00 15896.04 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d16384 41.33 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d16384 600.23 ± 0.00 3440.40 ± 0.00 3412.01 ± 0.00 3440.43 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d16384 40.08 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d32768 569.48 ± 0.00 51472.06 ± 0.00 51443.67 ± 0.00 51472.08 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d32768 17.39 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d32768 336.61 ± 0.00 6112.50 ± 0.00 6084.11 ± 0.00 6112.53 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d32768 16.63 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d65535 317.69 ± 0.00 182461.28 ± 0.00 182432.89 ± 0.00 182461.32 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d65535 11.02 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d65535 184.63 ± 0.00 11121.08 ± 0.00 11092.69 ± 0.00 11121.11 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d65535 10.33 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_pp @ d100000 218.32 ± 0.00 405616.25 ± 0.00 405587.86 ± 0.00 405616.27 ± 0.00
unsloth/glm-4.7-flash-GGUF ctx_tg @ d100000 7.68 ± 0.00
unsloth/glm-4.7-flash-GGUF pp2048 @ d100000 122.08 ± 0.00 16803.78 ± 0.00 16775.39 ± 0.00 16803.81 ± 0.00
unsloth/glm-4.7-flash-GGUF tg32 @ d100000 7.56 ± 0.00

./build/bin/llama-server \
  --hf-repo noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF \
  --host 0.0.0.0 \
  --port 41447 \
  -c 524288 \
  -ngl 999 \
  -t 20 \
  -np 4 \
  --cont-batching \
  -ub 2048 \
  -fa 1 \
  --jinja \
  --reasoning-format auto \
  --no-mmap

uvx llama-benchy \
  --base-url http://localhost:41447/v1 \
  --model noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF \
  --pp 2048 \
  --tg 32 \
  --depth 0 4096 8192 16384 32768 65535 100000 \
  --runs 1 \
  --enable-prefix-caching \
  --latency-mode generation

model test t/s ttfr (ms) est_ppt (ms) e2e_ttft (ms)
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 2254.97 ± 0.00 868.29 ± 0.00 841.70 ± 0.00 868.34 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 52.44 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d4096 1909.84 ± 0.00 1953.46 ± 0.00 1926.86 ± 0.00 1953.49 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d4096 45.53 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d4096 1471.85 ± 0.00 1418.04 ± 0.00 1391.45 ± 0.00 1418.08 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d4096 43.53 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d8192 1431.18 ± 0.00 5168.51 ± 0.00 5141.91 ± 0.00 5168.53 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d8192 44.16 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d8192 1022.83 ± 0.00 2028.88 ± 0.00 2002.28 ± 0.00 2028.92 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d8192 39.51 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d16384 986.46 ± 0.00 14576.67 ± 0.00 14550.08 ± 0.00 14576.71 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d16384 35.30 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d16384 677.58 ± 0.00 3049.12 ± 0.00 3022.53 ± 0.00 3049.16 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d16384 36.21 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d32768 592.33 ± 0.00 48857.25 ± 0.00 48830.66 ± 0.00 48857.29 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d32768 15.79 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d32768 338.26 ± 0.00 6081.19 ± 0.00 6054.59 ± 0.00 6081.22 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d32768 17.95 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d65535 323.80 ± 0.00 179278.38 ± 0.00 179251.79 ± 0.00 179278.42 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d65535 10.10 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d65535 184.81 ± 0.00 11108.39 ± 0.00 11081.79 ± 0.00 11108.42 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d65535 10.88 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_pp @ d100000 220.53 ± 0.00 402260.75 ± 0.00 402234.15 ± 0.00 402260.79 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF ctx_tg @ d100000 7.00 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF pp2048 @ d100000 112.32 ± 0.00 18260.09 ± 0.00 18233.50 ± 0.00 18260.15 ± 0.00
noctrex/GLM-4.7-Flash-MXFP4_MOE-GGUF tg32 @ d100000 6.94 ± 0.00