I tried to run it on my Docker setup with vLLM freshly built from the main branch - no bueno. It loads (with FlashInfer errors during loading), but produces complete garbage as output. The errors look like this:
(EngineCore_DP0 pid=294) 2025-12-16 07:15:12,481 - WARNING - autotuner.py:490 - flashinfer.jit: [Autotuner]: Skipping tactic <flashinfer.fused_moe.core.get_cutlass_fused_moe_module.<locals>.MoERunner object at 0xfb9e3269aea0> 41, due to failure while profiling: [TensorRT-LLM][ERROR] Assertion failed: GPU lacks the shared memory resources to run GroupedGEMM kernel (/workspace/csrc/nv_internal/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h:175)
(EngineCore_DP0 pid=294) 1 0xfb9e194a5eb4 tensorrt_llm::common::throwRuntimeError(char const*, int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 84
(EngineCore_DP0 pid=294) 2 0xfb9e195d04f0 void tensorrt_llm::kernels::cutlass_kernels_oss::dispatchGemmConfig<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, cutlass::arch::Sm89, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, cutlass::gemm::GemmShape<16, 256, 128>, cutlass::gemm::GemmShape<16, 64, 128>, (void*)0>(tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>, int) + 2400
(EngineCore_DP0 pid=294) 3 0xfb9e195d8394 void tensorrt_llm::kernels::cutlass_kernels_oss::dispatchMoeGemmToCutlass<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, cutlass::arch::Sm89, tensorrt_llm::cutlass_extensions::EpilogueOpDefault, (void*)0>(tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>, int) + 356
(EngineCore_DP0 pid=294) 4 0xfb9e195d86f0 void tensorrt_llm::kernels::cutlass_kernels::MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>::dispatchToArch<tensorrt_llm::cutlass_extensions::EpilogueOpDefault>(tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>, tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput) + 320
(EngineCore_DP0 pid=294) 5 0xfb9e195d8ffc tensorrt_llm::kernels::cutlass_kernels::MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>::moeGemmBiasAct(tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>, tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput) + 508
(EngineCore_DP0 pid=294) 6 0xfb9e19954dcc tensorrt_llm::kernels::cutlass_kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16, void>::gemm2(tensorrt_llm::kernels::cutlass_kernels::MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>&, tensorrt_llm::kernels::fp8_blockscale_gemm::CutlassFp8BlockScaleGemmRunnerInterface*, __nv_fp8_e4m3 const*, void*, __nv_bfloat16*, long const*, tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput, __nv_fp8_e4m3 const*, __nv_bfloat16 const*, __nv_bfloat16 const*, float const*, unsigned char const*, tensorrt_llm::kernels::cutlass_kernels::QuantParams, float const*, float const*, int const*, int const*, int const*, long const*, long, long, long, long, long, int, long, float const**, bool, void*, CUstream_st*, tensorrt_llm::kernels::cutlass_kernels::MOEParallelismConfig, bool, tensorrt_llm::cutlass_extensions::CutlassGemmConfig, bool, int*, int*, bool) + 716
(EngineCore_DP0 pid=294) 7 0xfb9e19955240 tensorrt_llm::kernels::cutlass_kernels::CutlassMoeFCRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16, void>::gemm2(void const*, void*, void*, long const*, tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput, void const*, void const*, void const*, float const*, unsigned char const*, tensorrt_llm::kernels::cutlass_kernels::QuantParams, float const*, float const*, int const*, int const*, int const*, long const*, long, long, long, long, long, int, long, float const**, bool, void*, bool, CUstream_st*, tensorrt_llm::kernels::cutlass_kernels::MOEParallelismConfig, bool, tensorrt_llm::cutlass_extensions::CutlassGemmConfig, bool, int*, int*, bool) + 400
(EngineCore_DP0 pid=294) 8 0xfb9e198ec9f8 tensorrt_llm::kernels::cutlass_kernels::GemmProfilerBackend::runProfiler(int, tensorrt_llm::cutlass_extensions::CutlassGemmConfig const&, char*, void const*, bool, CUstream_st* const&) + 2552
(EngineCore_DP0 pid=294) 9 0xfb9e19897318 /usr/local/lib/python3.12/dist-packages/flashinfer_jit_cache/jit_cache/fused_moe_120/fused_moe_120.so(+0x567318) [0xfb9e19897318]
(EngineCore_DP0 pid=294) 10 0xfb9e198bf074 /usr/local/lib/python3.12/dist-packages/flashinfer_jit_cache/jit_cache/fused_moe_120/fused_moe_120.so(+0x58f074) [0xfb9e198bf074]
(EngineCore_DP0 pid=294) 11 0xfb9e19894608 tvm::ffi::details::FunctionObjImpl<tvm::ffi::Function::FromTyped<FusedMoeRunner::GetFunction(tvm::ffi::String const&)::{lambda(tvm::ffi::TensorView, tvm::ffi::TensorView, tvm::ffi::Optional<tvm::ffi::TensorView, void>, tvm::ffi::TensorView, tvm::ffi::Optional<tvm::ffi::TensorView, void>, long, long, long, long, long, long, long, bool, bool, long, long, bool, bool, long)#1}>(FusedMoeRunner::GetFunction(tvm::ffi::String const&)::{lambda(tvm::ffi::TensorView, tvm::ffi::TensorView, tvm::ffi::Optional<tvm::ffi::TensorView, void>, tvm::ffi::TensorView, tvm::ffi::Optional<tvm::ffi::TensorView, void>, long, long, long, long, long, long, long, bool, bool, long, long, bool, bool, long)#1}&&)::{lambda(tvm::ffi::AnyView const*, int, tvm::ffi::Any*)#1}>::SafeCall(void*, TVMFFIAny const*, int, TVMFFIAny*) + 696
(EngineCore_DP0 pid=294) 12 0xfb9e9c43231c /usr/local/lib/python3.12/dist-packages/tvm_ffi/core.abi3.so(+0x5231c) [0xfb9e9c43231c]
(EngineCore_DP0 pid=294) 13 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 14 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 15 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 16 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 17 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 18 0x4c4a44 _PyObject_Call_Prepend + 436
(EngineCore_DP0 pid=294) 19 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 20 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 21 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 22 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 23 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 24 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 25 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 26 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 27 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 28 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 29 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 30 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 31 0x4c4a44 _PyObject_Call_Prepend + 436
(EngineCore_DP0 pid=294) 32 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 33 0x4c2f30 _PyObject_MakeTpCall + 304
(EngineCore_DP0 pid=294) 34 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 35 0xfba179d6d020 /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0xccd020) [0xfba179d6d020]
(EngineCore_DP0 pid=294) 36 0xfba17a117ed4 /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x1077ed4) [0xfba17a117ed4]
(EngineCore_DP0 pid=294) 37 0xfba17497b75c /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so(+0x636b75c) [0xfba17497b75c]
(EngineCore_DP0 pid=294) 38 0xfba179e82b60 /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0xde2b60) [0xfba179e82b60]
(EngineCore_DP0 pid=294) 39 0xfba179e830f0 /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0xde30f0) [0xfba179e830f0]
(EngineCore_DP0 pid=294) 40 0xfba179d61860 /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0xcc1860) [0xfba179d61860]
(EngineCore_DP0 pid=294) 41 0xfba179677cac /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_python.so(+0x5d7cac) [0xfba179677cac]
(EngineCore_DP0 pid=294) 42 0x503884 VLLM::EngineCore() [0x503884]
(EngineCore_DP0 pid=294) 43 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 44 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 45 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 46 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 47 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 48 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 49 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 50 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 51 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 52 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 53 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 54 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 55 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 56 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 57 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 58 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 59 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 60 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 61 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 62 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 63 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 64 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 65 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 66 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 67 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 68 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 69 0x4c6fac VLLM::EngineCore() [0x4c6fac]
(EngineCore_DP0 pid=294) 70 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 71 0x4c6fac VLLM::EngineCore() [0x4c6fac]
(EngineCore_DP0 pid=294) 72 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 73 0x4c6fac VLLM::EngineCore() [0x4c6fac]
(EngineCore_DP0 pid=294) 74 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 75 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 76 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 77 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 78 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 79 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 80 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 81 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 82 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 83 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 84 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 85 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 86 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 87 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 88 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 89 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 90 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 91 0x4c4954 _PyObject_Call_Prepend + 196
(EngineCore_DP0 pid=294) 92 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 93 0x4c2e78 _PyObject_MakeTpCall + 120
(EngineCore_DP0 pid=294) 94 0x564f28 _PyEval_EvalFrameDefault + 2292
(EngineCore_DP0 pid=294) 95 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 96 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 97 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 98 0x4c7088 VLLM::EngineCore() [0x4c7088]
(EngineCore_DP0 pid=294) 99 0x4c5408 PyObject_Call + 280
(EngineCore_DP0 pid=294) 100 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 101 0x4c4a44 _PyObject_Call_Prepend + 436
(EngineCore_DP0 pid=294) 102 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 103 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 104 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 105 0x4c4a44 _PyObject_Call_Prepend + 436
(EngineCore_DP0 pid=294) 106 0x528f50 VLLM::EngineCore() [0x528f50]
(EngineCore_DP0 pid=294) 107 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 108 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 109 0x4c7024 VLLM::EngineCore() [0x4c7024]
(EngineCore_DP0 pid=294) 110 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 111 0x4c4a44 _PyObject_Call_Prepend + 436
(EngineCore_DP0 pid=294) 112 0x5228b0 VLLM::EngineCore() [0x5228b0]
(EngineCore_DP0 pid=294) 113 0x51df74 VLLM::EngineCore() [0x51df74]
(EngineCore_DP0 pid=294) 114 0x4c535c PyObject_Call + 108
(EngineCore_DP0 pid=294) 115 0x56832c _PyEval_EvalFrameDefault + 15608
(EngineCore_DP0 pid=294) 116 0x563224 PyEval_EvalCode + 304
(EngineCore_DP0 pid=294) 117 0x59bfb0 PyRun_StringFlags + 224
(EngineCore_DP0 pid=294) 118 0x67f0d4 PyRun_SimpleStringFlags + 68
(EngineCore_DP0 pid=294) 119 0x68b890 Py_RunMain + 912
(EngineCore_DP0 pid=294) 120 0x68b398 Py_BytesMain + 40
(EngineCore_DP0 pid=294) 121 0xfba2081a84c4 /usr/lib/aarch64-linux-gnu/libc.so.6(+0x284c4) [0xfba2081a84c4]
(EngineCore_DP0 pid=294) 122 0xfba2081a8598 __libc_start_main + 152
(EngineCore_DP0 pid=294) 123 0x5f6bb0 _start + 48
I even tried the nightly cu130 builds of PyTorch and FlashInfer - same result.