Looking to run Qwen3-VL-4B on an Orin Nano.
Has anyone gotten it running?
I created a new conda environment.
I installed torch, torchvision, and torchaudio from https://pypi.jetson-ai-lab.io/jp6/cu126
Then I installed the latest transformers with pip install git+https://github.com/huggingface/transformers --index-url https://pypi.jetson-ai-lab.io/jp6/cu126
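As a sanity check before going through vLLM, I'd first try loading the model with plain transformers. Below is a minimal sketch of that, assuming the nightly transformers build exposes Qwen3-VL through AutoModelForImageTextToText (the class choice and generation settings are my guesses, not something I've verified on the Nano):

# Minimal transformers-only smoke test for Qwen3-VL-4B.
# Sketch, not a verified Jetson recipe: assumes the git build of
# transformers routes Qwen3-VL through AutoModelForImageTextToText,
# and that accelerate is installed for device_map="auto".
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "Qwen/Qwen3-VL-4B-Instruct"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # fp16 to fit the Orin Nano's shared memory
    device_map="auto",
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png"},
            {"type": "text", "text": "Read all the text in the image."},
        ],
    }
]

# Recent transformers can tokenize and fetch the image in one call.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

out = model.generate(**inputs, max_new_tokens=256)
# Strip the prompt tokens and decode only the generated continuation.
print(processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])

If that OOMs or crashes, it at least separates a transformers/model problem from a vLLM-on-Jetson problem.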
There’s an Unsloth version that’s 4-bit quantized, but I haven’t been able to get that working either:
https://huggingface.co/unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit
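A pre-quantized bnb-4bit checkpoint like that should in principle load through plain transformers, since the quantization config is embedded in the repo. The sketch below assumes bitsandbytes actually imports and runs on JetPack's aarch64 CUDA stack, which is exactly the part I haven't confirmed:

# Sketch: load the pre-quantized unsloth bnb-4bit checkpoint via transformers.
# Assumption: a working bitsandbytes build for Jetson (aarch64 + CUDA 12.6).
# The repo already carries its own BitsAndBytesConfig, so no explicit
# quantization_config should be needed here.
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

model_id = "unsloth/Qwen3-VL-4B-Instruct-unsloth-bnb-4bit"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    device_map="auto",          # let accelerate place weights on the GPU
    torch_dtype=torch.float16,  # compute dtype for the non-quantized parts
)

print(type(model).__name__, next(model.parameters()).device)

If bitsandbytes is the blocker, the failure usually shows up right at import or from_pretrained time, which would at least narrow things down.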
Any ideas would be appreciated! ❤️

Here's the vLLM script I've been trying:
# -*- coding: utf-8 -*-
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
from vllm import LLM, SamplingParams
import os
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def prepare_inputs_for_vllm(messages, processor):
    """Render the chat template and collect image/video tensors for vLLM."""
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # qwen_vl_utils 0.0.14+ required
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
        return_video_kwargs=True,
        return_video_metadata=True,
    )
    print(f"video_kwargs: {video_kwargs}")

    # Only attach the modalities actually present in the messages.
    mm_data = {}
    if image_inputs is not None:
        mm_data['image'] = image_inputs
    if video_inputs is not None:
        mm_data['video'] = video_inputs

    return {
        'prompt': text,
        'multi_modal_data': mm_data,
        'mm_processor_kwargs': video_kwargs,
    }
if __name__ == '__main__':
    # messages = [
    #     {
    #         "role": "user",
    #         "content": [
    #             {
    #                 "type": "video",
    #                 "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-VL/space_woaudio.mp4",
    #             },
    #             {"type": "text", "text": "How long is this video?"},
    #         ],
    #     }
    # ]
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "https://qianwen-res.oss-accelerate.aliyuncs.com/Qwen3-VL/receipt.png",
                },
                {"type": "text", "text": "Read all the text in the image."},
            ],
        }
    ]
    # TODO: change to your own checkpoint path
    checkpoint_path = "Qwen/Qwen3-VL-4B-Instruct-FP8"
    processor = AutoProcessor.from_pretrained(checkpoint_path)

    inputs = [prepare_inputs_for_vllm(message, processor) for message in [messages]]

    llm = LLM(
        model=checkpoint_path,
        trust_remote_code=True,
        gpu_memory_utilization=0.70,
        enforce_eager=False,
        tensor_parallel_size=torch.cuda.device_count(),
        seed=0,
    )

    sampling_params = SamplingParams(
        temperature=0,
        max_tokens=1024,
        top_k=-1,
        stop_token_ids=[],
    )

    for i, input_ in enumerate(inputs):
        print()
        print('=' * 40)
        print(f"Inputs[{i}]: {input_['prompt']=!r}")
    print('\n' + '>' * 40)

    outputs = llm.generate(inputs, sampling_params=sampling_params)
    for i, output in enumerate(outputs):
        generated_text = output.outputs[0].text
        print()
        print('=' * 40)
        print(f"Generated text: {generated_text!r}")