On Thor, if you encounter an error like:
CUDA error (/opt/xformers/third_party/flash-attention/hopper/flash_fwd_launch_template.h:160): no kernel image is available for execution on the device — or: NotImplementedError: VLLM_USE_V1=1 is not supported with VLLM_ATTENTION_BACKEND=XFORMERS
File "/usr/local/lib/python3.12/dist-packages/vllm/attention/selector.py", line 200, in _cached_get_attn_backend
raise ValueError(
ValueError: Invalid attention backend. Valid backends are: ['FLASH_ATTN', 'FLASH_ATTN_VLLM_V1', 'TRITON_ATTN_VLLM_V1', 'XFORMERS', 'ROCM_FLASH', 'ROCM_AITER_MLA', 'ROCM_AITER_MLA_VLLM_V1', 'ROCM_AITER_FA', 'TORCH_SDPA', 'FLASHINFER', 'FLASHINFER_VLLM_V1', 'TRITON_MLA', 'TRITON_MLA_VLLM_V1', 'FLASHMLA_VLLM_V1', 'FLASHMLA', 'CUTLASS_MLA', 'PALLAS', 'PALLAS_VLLM_V1', 'IPEX', 'DUAL_CHUNK_FLASH_ATTN', 'DIFFERENTIAL_FLASH_ATTN', 'NO_ATTENTION', 'FLEX_ATTENTION', 'TREE_ATTN', 'XFORMERS_VLLM_V1']
You could try the following.
# On Thor: fetch FlashAttention with all of its submodules in one step,
# and pre-create the cache directories the container will bind-mount.
git clone --recurse-submodules https://github.com/Dao-AILab/flash-attention.git
cd flash-attention
mkdir -p "$HOME/.cache/huggingface" "$HOME/.cache/vllm" "$HOME/.cache/flashinfer"
# Add swap so the build does not run out of memory. Sized here at 16G;
# scale it up (e.g. 32G) if your NVMe drive has room to spare.
sudo fallocate -l 16G /swapfile
sudo chmod 600 /swapfile
sudo mkswap /swapfile
# Register the swapfile in fstab so it persists across reboots, then enable it.
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
sudo swapon -a
# Flush dirty pages to persistent storage, then drop the clean page cache
# to maximize free RAM before the build.
sudo sync && echo 3 | sudo tee /proc/sys/vm/drop_caches
# Start the base docker VLLM image
docker run -it --rm --net=host --name vllm1 --runtime nvidia --privileged \
--ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --shm-size=4g \
-v $HOME/.cache:/root/.cache \
-v $PWD:/workspace \
--workdir /workspace \
-e $HF_TOKEN \
nvcr.io/nvidia/vllm:25.09-py3 bash
Now, in the vLLM docker container:
# Preserve the stock xformers install before uninstalling it, so it can be
# restored after the FlashAttention build.
cp -pr /usr/local/lib/python3.12/dist-packages/xformers \
       /usr/local/lib/python3.12/dist-packages/xformers_backup
# Bring in build tooling, then remove the packages we are about to rebuild.
python3 -m pip install -U pip wheel ninja packaging
python3 -m pip uninstall -y flash-attn xformers
# Build FlashAttention from source for Thor (sm_110) to produce the
# flash_attn_2_cuda.cpython-312-aarch64-linux-gnu.so engine.
export CUDA_HOME=/usr/local/cuda
export TORCH_CUDA_ARCH_LIST="11.0+PTX"
export FLASH_ATTN_CUDA_ARCHS="110"
export FLASH_ATTENTION_FORCE_BUILD=TRUE
# Really 8 -- higher parallelism may lock up your Thor under memory pressure.
export MAX_JOBS=8
export USE_NINJA=1
export BUILD_TARGET="cuda"
# Build the wheel once, then install it. The original ran the deprecated
# "setup.py install" first and then "pip wheel .", compiling everything twice.
# The wheel in dist/ is also your saved artifact for later reuse.
python3 -m pip wheel . -v -w dist
python3 -m pip install dist/flash_attn-*.whl
# Put the stock xformers package saved earlier back in place.
mv /usr/local/lib/python3.12/dist-packages/xformers_backup \
   /usr/local/lib/python3.12/dist-packages/xformers
# Keep this container running.
# On Thor, in a second terminal, save the container state as a new image:
docker commit vllm1 vllm:flashattn
# Back in the vllm1 container, exit it:
exit
Now start vLLM.
# Flush dirty pages to disk (sync), then drop the clean page cache so the
# model load has as much free RAM as possible.
sudo sync && echo 3 |sudo tee /proc/sys/vm/drop_caches
# Choose your desired --model and substitute below. Here we use
# nvidia/Llama-3.1-8B-Instruct-FP4 to use less memory & test an FP4 model.
# A. For most use cases.
# NOTE(review): with HF_HOME=/root/.cache/huggingface the hub cache lives
# under .../huggingface/hub -- point --download-dir there so variants A and B
# share one download instead of fetching the model twice.
# NOTE(review): VLLM_WORKER_MULTIPROC=0 -- confirm against your vLLM version;
# the documented knob is usually VLLM_WORKER_MULTIPROC_METHOD.
docker run --name vllm --rm -it --network host \
  --runtime=nvidia --gpus all --ipc=host \
  --ulimit memlock=-1 --ulimit stack=67108864 --shm-size=4g \
  -e VLLM_USE_V1=1 -e VLLM_WORKER_MULTIPROC=0 \
  -e HF_HOME=/root/.cache/huggingface \
  -v "$HOME/.cache:/root/.cache" \
  vllm:flashattn \
  python3 -m vllm.entrypoints.openai.api_server \
    --model nvidia/Llama-3.1-8B-Instruct-FP4 \
    --download-dir /root/.cache/huggingface/hub \
    --host 0.0.0.0 --port 8000 \
    --tensor-parallel-size 1 \
    --max-model-len 512 \
    --max-num-seqs 2 \
    --gpu-memory-utilization 0.30 \
    --kv-cache-dtype auto \
    --enforce-eager \
    --chat-template-content-format string
# and/or B. Enable CUDA graphs (no --enforce-eager), which performs a little
# compilation on startup in exchange for faster steady-state inference.
docker run --rm -it --name vllm --network host \
  --runtime=nvidia --gpus all --ipc=host \
  --shm-size=4g --ulimit memlock=-1 --ulimit stack=67108864 \
  -e HF_HOME=/root/.cache/huggingface \
  -e VLLM_USE_V1=1 -e VLLM_WORKER_MULTIPROC=0 \
  -v "$HOME/.cache:/root/.cache" \
  vllm:flashattn \
  python3 -m vllm.entrypoints.openai.api_server \
    --model nvidia/Llama-3.1-8B-Instruct-FP4 \
    --download-dir /root/.cache/huggingface/hub \
    --host 0.0.0.0 --port 8000 \
    --tensor-parallel-size 1 \
    --max-num-seqs 2 \
    --max-model-len 512 \
    --gpu-memory-utilization 0.30 \
    --kv-cache-dtype auto
# One method of interacting with vLLM; change the question if desired.
curl -X POST 'http://127.0.0.1:8000/v1/chat/completions' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "model": "nvidia/Llama-3.1-8B-Instruct-FP4",
  "messages": [{"role":"user", "content": "What are Chihuahuas famous for?"}]
  }' | jq
When done with vLLM, press Control-C; it will print:
(APIServer pid=1) INFO: Shutting down
(APIServer pid=1) INFO: Waiting for application shutdown.
(APIServer pid=1) INFO: Application shutdown complete.