Thank you so much @eugr, you've been a great help!
I wanted to share my Dockerfile with everyone; it works perfectly. Have fun! :)
Dockerfile
FROM nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04
# Install essentials
RUN apt-get update && apt-get install -y \
    python3.12 python3.12-venv python3-pip git wget patch \
    && rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Create virtual env
RUN python3.12 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Upgrade pip
RUN pip install --upgrade pip
# Install PyTorch + CUDA
RUN pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu130
# Install pre-release deps
RUN pip install xgrammar triton flashinfer-python --pre
# Clone vLLM
RUN git clone https://github.com/vllm-project/vllm.git
WORKDIR /app/vllm
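# use_existing_torch.py strips the pinned torch requirements so the cu130 wheels installed above are reused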
RUN python3 use_existing_torch.py
RUN pip install -r requirements/build.txt
# Apply patch
COPY vllm_patch.diff .
RUN patch -p1 < vllm_patch.diff
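# Install build tools for compiling the CUDA kernels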
RUN apt-get update && apt-get install -y \
    cmake \
    build-essential \
    ninja-build \
    && rm -rf /var/lib/apt/lists/*
# Set essential environment variables
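# 12.0f = Blackwell consumer GPUs (e.g. RTX 50 series); adjust this for your card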
ENV TORCH_CUDA_ARCH_LIST=12.0f
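# Point Triton at the CUDA toolkit's ptxas (the bundled one may lag behind new archs)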
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
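# tiktoken reads encodings from this path (populated below) instead of downloading them at runtime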
ENV TIKTOKEN_ENCODINGS_BASE=/app/tiktoken_encodings
# Install vLLM with local build
RUN pip install --no-build-isolation -e . -v --pre
# Download tiktoken encodings
WORKDIR /app
RUN mkdir -p tiktoken_encodings && \
    wget -O tiktoken_encodings/o200k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken" && \
    wget -O tiktoken_encodings/cl100k_base.tiktoken "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken"
WORKDIR /app/vllm
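# vllm is already satisfied by the editable install above, so this only pulls in the audio extras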
RUN pip install "vllm[audio]"
# Expose port
EXPOSE 8888
ENTRYPOINT []
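To try it, here is a minimal build/run sketch (the vllm-cuda13 tag and <your-model> are placeholders, and --shm-size is optional but helps vLLM's workers):

docker build -t vllm-cuda13 .
docker run --gpus all --shm-size=8g -p 8888:8888 vllm-cuda13 \
    vllm serve <your-model> --host 0.0.0.0 --port 8888

Since the ENTRYPOINT is empty, whatever command you pass to docker run is executed directly, with the venv already on PATH.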
vllm_patch.diff (thanks again @eugr for this patch; it narrows the FP4 and scaled-MM kernel arch lists to the 10.0-family targets so the CUDA 13 build skips the 11.0f/12.0f variants of those kernels)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7cb94f919..f860e533e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -594,9 +594,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 
   # FP4 Archs and flags
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
@@ -668,7 +668,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
@@ -716,9 +716,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f" "${CUDA_ARCHS}")
   else()
-    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
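If you want to sanity-check the image before serving anything, a quick sketch (assumes the vllm-cuda13 tag from the build step above):

# On the host: the GPU's compute capability should match TORCH_CUDA_ARCH_LIST (12.0 here)
nvidia-smi --query-gpu=compute_cap --format=csv,noheader
# Inside the container: confirm PyTorch sees the GPU and the expected CUDA version
docker run --rm --gpus all vllm-cuda13 \
    python3 -c "import torch; print(torch.cuda.get_device_name(0), torch.version.cuda)"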