Thanks @Eugr,
I’ve been looking at your community build, but I haven’t had much luck with the first recipe I tried, gpt-oss mxfp4. It looks like the Docker image built okay, but when running vLLM I get the error shown in the log below. Any thoughts?
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:42 [mxfp4.py:803] [MXFP4] lm_head quantized: torch.Size([100544, 2880]) BF16 -> torch.Size([100544, 1440]) FP4 (4x smaller)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:42 [gpu_model_runner.py:3901] Model loading took 31.9972 GiB memory and 50.021620 seconds
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:43 [decorators.py:432] Directly load AOT compilation from path /root/.cache/vllm/torch_aot_compile/fb5f1d6fe83b24616726df86c6bc4bca2d951023b09c3197cd61ac41306ddb9d/rank_0_0/model
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:44 [backends.py:644] Using cache directory: /root/.cache/vllm/torch_compile_cache/1c5fbe4478/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:44 [backends.py:704] Dynamo bytecode transform time: 1.20 s
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] EngineCore failed to start.
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] Traceback (most recent call last):
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 891, in run_engine_core
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 656, in __init__
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] super().__init__(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 112, in __init__
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 243, in _initialize_kv_caches
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 126, in determine_available_memory
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_executor.py", line 490, in collective_rpc
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return ray.get(ray_worker_outputs, timeout=timeout)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return fn(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2981, in get
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] values, debugger_breakpoint = worker.get_objects(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1012, in get_objects
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] raise value.as_instanceof_cause()
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ray.exceptions.RayTaskError(RuntimeError): ray::RayWorkerWrapper.execute_method() (pid=1459, ip=192.168.177.11, actor_id=04131c44ceb9ffd2e878146401000000, repr=<vllm.v1.executor.ray_utils.RayWorkerWrapper object at 0xe90d36aa17c0>)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 339, in execute_method
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] raise e
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 328, in execute_method
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return run_method(self, method, args, kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 322, in determine_available_memory
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] self.model_runner.profile_run()
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4750, in profile_run
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4545, in _dummy_sampler_run
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] logits = self.model.compute_logits(hidden_states)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py", line 726, in compute_logits
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] logits = self.logits_processor(self.lm_head, hidden_states)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 60, in forward
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] logits = self._get_logits(hidden_states, lm_head, embedding_bias)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 92, in _get_logits
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 828, in apply
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return apply_fp4_marlin_linear(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py", line 123, in apply_fp4_marlin_linear
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] output = ops.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1348, in gptq_marlin_gemm
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return torch.ops._C.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1205, in __call__
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] return self._op(*args, **kwargs)
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ERROR 03-01 15:08:53 [core.py:900] RuntimeError: Invalid thread config: thread_m_blocks = 1, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [256, 2880, 100544] and num_bits = 4, prob_m_split = 256, group_size = 32, has_act_order = 0, is_k_full = 1, has_zp = 0, is_zp_float = 0, stages = 4, max_shared_mem_new = 101376
(EngineCore_DP0 pid=1336) Process EngineCore_DP0:
(EngineCore_DP0 pid=1336) Traceback (most recent call last):
(EngineCore_DP0 pid=1336) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=1336) self.run()
(EngineCore_DP0 pid=1336) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=1336) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 904, in run_engine_core
(EngineCore_DP0 pid=1336) raise e
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 891, in run_engine_core
(EngineCore_DP0 pid=1336) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 656, in __init__
(EngineCore_DP0 pid=1336) super().__init__(
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 112, in __init__
(EngineCore_DP0 pid=1336) num_gpu_blocks, num_cpu_blocks, kv_cache_config = self._initialize_kv_caches(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 243, in _initialize_kv_caches
(EngineCore_DP0 pid=1336) available_gpu_memory = self.model_executor.determine_available_memory()
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 126, in determine_available_memory
(EngineCore_DP0 pid=1336) return self.collective_rpc("determine_available_memory")
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/ray_executor.py", line 490, in collective_rpc
(EngineCore_DP0 pid=1336) return ray.get(ray_worker_outputs, timeout=timeout)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
(EngineCore_DP0 pid=1336) return fn(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2981, in get
(EngineCore_DP0 pid=1336) values, debugger_breakpoint = worker.get_objects(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1012, in get_objects
(EngineCore_DP0 pid=1336) raise value.as_instanceof_cause()
(EngineCore_DP0 pid=1336) ray.exceptions.RayTaskError(RuntimeError): ray::RayWorkerWrapper.execute_method() (pid=1459, ip=192.168.177.11, actor_id=04131c44ceb9ffd2e878146401000000, repr=<vllm.v1.executor.ray_utils.RayWorkerWrapper object at 0xe90d36aa17c0>)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 339, in execute_method
(EngineCore_DP0 pid=1336) raise e
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 328, in execute_method
(EngineCore_DP0 pid=1336) return run_method(self, method, args, kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 322, in determine_available_memory
(EngineCore_DP0 pid=1336) self.model_runner.profile_run()
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4750, in profile_run
(EngineCore_DP0 pid=1336) output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4545, in _dummy_sampler_run
(EngineCore_DP0 pid=1336) logits = self.model.compute_logits(hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py", line 726, in compute_logits
(EngineCore_DP0 pid=1336) logits = self.logits_processor(self.lm_head, hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=1336) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
(EngineCore_DP0 pid=1336) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 60, in forward
(EngineCore_DP0 pid=1336) logits = self._get_logits(hidden_states, lm_head, embedding_bias)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 92, in _get_logits
(EngineCore_DP0 pid=1336) logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 828, in apply
(EngineCore_DP0 pid=1336) return apply_fp4_marlin_linear(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py", line 123, in apply_fp4_marlin_linear
(EngineCore_DP0 pid=1336) output = ops.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1348, in gptq_marlin_gemm
(EngineCore_DP0 pid=1336) return torch.ops._C.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1205, in __call__
(EngineCore_DP0 pid=1336) return self._op(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) RuntimeError: Invalid thread config: thread_m_blocks = 1, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [256, 2880, 100544] and num_bits = 4, prob_m_split = 256, group_size = 32, has_act_order = 0, is_k_full = 1, has_zp = 0, is_zp_float = 0, stages = 4, max_shared_mem_new = 101376
(EngineCore_DP0 pid=1336) INFO 03-01 15:08:53 [ray_executor.py:121] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray.
(EngineCore_DP0 pid=1336) 2026-03-01 15:08:53,026 ERROR worker.py:439 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerWrapper.execute_method() (pid=385, ip=192.168.177.12, actor_id=5fbcb749f51c18921b4f26f401000000, repr=<vllm.v1.executor.ray_utils.RayWorkerWrapper object at 0xe244ec4df470>)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 339, in execute_method
(EngineCore_DP0 pid=1336) raise e
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 328, in execute_method
(EngineCore_DP0 pid=1336) return run_method(self, method, args, kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 322, in determine_available_memory
(EngineCore_DP0 pid=1336) self.model_runner.profile_run()
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4750, in profile_run
(EngineCore_DP0 pid=1336) output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4545, in _dummy_sampler_run
(EngineCore_DP0 pid=1336) logits = self.model.compute_logits(hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py", line 726, in compute_logits
(EngineCore_DP0 pid=1336) logits = self.logits_processor(self.lm_head, hidden_states)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=1336) return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
(EngineCore_DP0 pid=1336) return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 60, in forward
(EngineCore_DP0 pid=1336) logits = self._get_logits(hidden_states, lm_head, embedding_bias)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 92, in _get_logits
(EngineCore_DP0 pid=1336) logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 828, in apply
(EngineCore_DP0 pid=1336) return apply_fp4_marlin_linear(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py", line 123, in apply_fp4_marlin_linear
(EngineCore_DP0 pid=1336) output = ops.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1348, in gptq_marlin_gemm
(EngineCore_DP0 pid=1336) return torch.ops._C.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1205, in __call__
(EngineCore_DP0 pid=1336) return self._op(*args, **kwargs)
(EngineCore_DP0 pid=1336) ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) RuntimeError: Invalid thread config: thread_m_blocks = 1, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [256, 2880, 100544] and num_bits = 4, prob_m_split = 256, group_size = 32, has_act_order = 0, is_k_full = 1, has_zp = 0, is_zp_float = 0, stages = 4, max_shared_mem_new = 101376
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:52 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 6.811 s
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:52 [monitor.py:34] torch.compile takes 8.01 s in total
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] Error executing method 'determine_available_memory'. This might cause deadlock in distributed execution.
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] Traceback (most recent call last):
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 328, in execute_method
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return run_method(self, method, args, kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 322, in determine_available_memory
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] self.model_runner.profile_run()
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4750, in profile_run
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return func(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4545, in _dummy_sampler_run
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self.model.compute_logits(hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py", line 726, in compute_logits
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self.logits_processor(self.lm_head, hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 60, in forward
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self._get_logits(hidden_states, lm_head, embedding_bias)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 92, in _get_logits
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 828, in apply
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return apply_fp4_marlin_linear(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py", line 123, in apply_fp4_marlin_linear
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] output = ops.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1348, in gptq_marlin_gemm
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return torch.ops._C.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1205, in __call__
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] return self._op(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) ERROR 03-01 15:08:53 [worker_base.py:338] RuntimeError: Invalid thread config: thread_m_blocks = 1, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [256, 2880, 100544] and num_bits = 4, prob_m_split = 256, group_size = 32, has_act_order = 0, is_k_full = 1, has_zp = 0, is_zp_float = 0, stages = 4, max_shared_mem_new = 101376
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=1459) INFO 03-01 15:08:41 [default_loader.py:291] Loading weights took 36.76 seconds
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:42 [mxfp4.py:803] [MXFP4] lm_head quantized: torch.Size([100544, 2880]) BF16 -> torch.Size([100544, 1440]) FP4 (4x smaller)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:42 [gpu_model_runner.py:3901] Model loading took 31.9972 GiB memory and 50.095736 seconds
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:43 [decorators.py:432] Directly load AOT compilation from path /root/.cache/vllm/torch_aot_compile/fb5f1d6fe83b24616726df86c6bc4bca2d951023b09c3197cd61ac41306ddb9d/rank_1_0/model
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:45 [backends.py:644] Using cache directory: /root/.cache/vllm/torch_compile_cache/1c5fbe4478/rank_1_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:45 [backends.py:704] Dynamo bytecode transform time: 2.22 s
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:52 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 6.239 s
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) INFO 03-01 15:08:52 [monitor.py:34] torch.compile takes 8.46 s in total
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] Error executing method 'determine_available_memory'. This might cause deadlock in distributed execution.
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] Traceback (most recent call last):
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 328, in execute_method
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return run_method(self, method, args, kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/serial_utils.py", line 461, in run_method
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return func(*args, **kwargs) [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^ [repeated 4x across cluster]
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 124, in decorate_context [repeated 2x across cluster]
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 322, in determine_available_memory
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] self.model_runner.profile_run()
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4750, in profile_run
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] output = self._dummy_sampler_run(last_hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4545, in _dummy_sampler_run
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self.model.compute_logits(hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py", line 726, in compute_logits
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self.logits_processor(self.lm_head, hidden_states)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1776, in _wrapped_call_impl
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return self._call_impl(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1787, in _call_impl
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return forward_call(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 60, in forward
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] logits = self._get_logits(hidden_states, lm_head, embedding_bias)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 92, in _get_logits
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] logits = lm_head.quant_method.apply(lm_head, hidden_states, bias=embedding_bias)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py", line 828, in apply
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return apply_fp4_marlin_linear(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py", line 123, in apply_fp4_marlin_linear
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] output = ops.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 1348, in gptq_marlin_gemm
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return torch.ops._C.gptq_marlin_gemm(
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1205, in __call__
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] return self._op(*args, **kwargs)
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] ^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=1336) (RayWorkerWrapper pid=385, ip=192.168.177.12) ERROR 03-01 15:08:53 [worker_base.py:338] RuntimeError: Invalid thread config: thread_m_blocks = 1, thread_k = -1, thread_n = -1, num_threads = -1 for MKN = [256, 2880, 100544] and num_bits = 4, prob_m_split = 256, group_size = 32, has_act_order = 0, is_k_full = 1, has_zp = 0, is_zp_float = 0, stages = 4, max_shared_mem_new = 101376
(APIServer pid=1229) Traceback (most recent call last):
(APIServer pid=1229) File "/usr/local/bin/vllm", line 10, in <module>
(APIServer pid=1229) sys.exit(main())
(APIServer pid=1229) ^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 73, in main
(APIServer pid=1229) args.dispatch_function(args)
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 60, in cmd
(APIServer pid=1229) uvloop.run(run_server(args))
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run
(APIServer pid=1229) return __asyncio.run(
(APIServer pid=1229) ^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
(APIServer pid=1229) return runner.run(main)
(APIServer pid=1229) ^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
(APIServer pid=1229) return self._loop.run_until_complete(task)
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper
(APIServer pid=1229) return await main
(APIServer pid=1229) ^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1325, in run_server
(APIServer pid=1229) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1344, in run_server_worker
(APIServer pid=1229) async with build_async_engine_client(
(APIServer pid=1229) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1229) return await anext(self.gen)
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 171, in build_async_engine_client
(APIServer pid=1229) async with build_async_engine_client_from_engine_args(
(APIServer pid=1229) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
(APIServer pid=1229) return await anext(self.gen)
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 212, in build_async_engine_client_from_engine_args
(APIServer pid=1229) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 207, in from_vllm_config
(APIServer pid=1229) return cls(
(APIServer pid=1229) ^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 134, in __init__
(APIServer pid=1229) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 122, in make_async_mp_client
(APIServer pid=1229) return AsyncMPClient(*client_args)
(APIServer pid=1229) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 824, in __init__
(APIServer pid=1229) super().__init__(
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 479, in __init__
(APIServer pid=1229) with launch_core_engines(vllm_config, executor_class, log_stats) as (
(APIServer pid=1229) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__
(APIServer pid=1229) next(self.gen)
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 921, in launch_core_engines
(APIServer pid=1229) wait_for_engine_startup(
(APIServer pid=1229) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 980, in wait_for_engine_startup
(APIServer pid=1229) raise RuntimeError(
(APIServer pid=1229) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}