Converting a model to TensorRT and predicting on Jetson AGX Orin

Hello,

I am using a Jetson AGX Orin 64GB and want to run inference with the DDRNet model (https://github.com/Chenghao-Tan/DDRNet), but I don't know how to convert this model to TensorRT and run prediction.

Can you help me?

Dear @ahmet.gumustas,
You can convert the .pth model to ONNX and then use the trtexec tool to build and evaluate a TensorRT engine from your ONNX model.
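For example, the export step could look roughly like this (a minimal sketch; build_ddrnet(), the checkpoint path, the input names, and the input shapes are placeholders that you need to adapt to DDRNet):

import torch

# Hypothetical: build the DDRNet model from the repo and load the .pth checkpoint
model = build_ddrnet()  # placeholder for the repo's model constructor
model.load_state_dict(torch.load("ddrnet.pth", map_location="cpu"))
model.eval()

# Dummy inputs with the shapes the network expects (assumed here)
dummy_rgb = torch.randn(1, 3, 360, 640)
dummy_depth = torch.randn(1, 1, 360, 640)

torch.onnx.export(
    model,
    (dummy_rgb, dummy_depth),
    "ddrnet.onnx",
    input_names=["rgb", "depth"],
    output_names=["output"],
    opset_version=13,
)

Then you can build and time a TensorRT engine from the ONNX file on the Orin, for example:

trtexec --onnx=ddrnet.onnx --saveEngine=ddrnet_fp16.engine --fp16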

Also, please see

Hello @SivaRamaKrishnaNV,

I tried some methods. Here is my code:

import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import cv2

# TensorRT logger
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)  # you can use VERBOSE for more detail

# Path to the serialized engine file
engine_file_path = "/home/ar/workspace/ai_obstacle/DDRNet/mIoU_0_fp16.9042.engine"


# Function to load the engine
def load_engine(engine_file_path):
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        print("Engine loaded successfully.")
        return engine


# Allocate host and device buffers for all bindings
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    output_bindings = []  # keep the output binding (tensor) names
    stream = cuda.Stream()

    for binding in engine:
        binding_shape = engine.get_tensor_shape(binding)
        size = trt.volume(binding_shape)
        dtype = trt.nptype(engine.get_tensor_dtype(binding))
        
        print(f"Binding: {binding}, Shape: {binding_shape}, DType: {dtype}, Size: {size}")

        # Allocate host and device memory
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(device_mem))

        if engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
            inputs.append((host_mem, device_mem))
        else:
            outputs.append((host_mem, device_mem))
            output_bindings.append(binding)

    print(f"Allocated {len(inputs)} input buffers and {len(outputs)} output buffers.")
    return inputs, outputs, bindings, stream, output_bindings


# Function to run inference
def infer(engine, inputs, outputs, bindings, stream, input_data_rgb, input_data_depth, output_bindings):
    with engine.create_execution_context() as context:
        # Copy the RGB input into the host buffer
        np.copyto(inputs[0][0], input_data_rgb.ravel())
        print("RGB input data copied to host buffer.")

        # Copy the depth input into the host buffer
        np.copyto(inputs[1][0], input_data_depth.ravel())
        print("Depth input data copied to host buffer.")

        # Copy the RGB data to the device
        cuda.memcpy_htod_async(inputs[0][1], inputs[0][0], stream)
        print("RGB input data copied to device.")

        # Copy the depth data to the device
        cuda.memcpy_htod_async(inputs[1][1], inputs[1][0], stream)
        print("Depth input data copied to device.")

        # Run the model
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        print("Model inference executed.")

        # Copy the output data back to the host
        for out in outputs:
            cuda.memcpy_dtoh_async(out[0], out[1], stream)
        print("Output data copied to host.")

        # Synchronize the stream
        stream.synchronize()
        print("Stream synchronized.")

    # Get the shapes of the output bindings
    output_shapes = [engine.get_tensor_shape(binding) for binding in output_bindings]
    # Reshape the outputs
    reshaped_outputs = [out[0].reshape(shape) for out, shape in zip(outputs, output_shapes)]
    
    return reshaped_outputs


engine = load_engine(engine_file_path)

inputs, outputs, bindings, stream, output_bindings = allocate_buffers(engine)

cap = cv2.VideoCapture("/home/ar/workspace/ai_obstacle/DDRNet/data/USV_VIDEO.mp4")  # open the video
frame_count = 0
while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_count += 1
    print(f"\nProcessing frame {frame_count}")

    input_data_rgb = cv2.resize(frame, (640, 360)).astype(np.float32) / 255.0
    input_data_rgb = input_data_rgb.transpose(2, 0, 1)  # HWC -> CHW
    input_data_rgb = np.expand_dims(input_data_rgb, axis=0)  # add batch dimension

   
    input_data_depth = np.random.rand(1, 1, 360, 1280).astype(np.float32)  # replace with real depth data

    # Print the input shapes and some sample values
    print("RGB Input data shape:", input_data_rgb.shape)
    print("RGB Input data sample (first channel):", input_data_rgb[0, 0, :5, :5])
    print("Depth Input data shape:", input_data_depth.shape)
    print("Depth Input data sample:", input_data_depth[0, 0, :5, :5])

    # Run the model and get the results
    results = infer(engine, inputs, outputs, bindings, stream, input_data_rgb, input_data_depth, output_bindings)

    # Process the results (e.g., visualize the output masks)
    for i, result in enumerate(results):
        print(f"Output {i} shape: {result.shape}")
        print(f"Output {i} sample data:", result.flatten()[:10])

cap.release()
cv2.destroyAllWindows()
print("Inference completed.")

I used this code, and it gave me this output:

Processing frame 76
RGB Input data shape: (1, 3, 360, 640)
RGB Input data sample (first channel): [[0.96862745 0.9882353  0.9843137  0.9843137  0.9843137 ]
 [0.96862745 0.9882353  0.9843137  0.9843137  0.9843137 ]
 [0.96862745 0.9882353  0.9843137  0.9843137  0.9843137 ]
 [0.96862745 0.9882353  0.9843137  0.9843137  0.9843137 ]
 [0.972549   0.99215686 0.9882353  0.9882353  0.9882353 ]]
Depth Input data shape: (1, 1, 360, 1280)
Depth Input data sample: [[0.66846114 0.39051682 0.67108285 0.37366867 0.19029744]
 [0.5988981  0.50117624 0.45912832 0.828911   0.59848493]
 [0.56340754 0.42418364 0.86585414 0.0888955  0.33443424]
 [0.74872255 0.15506409 0.1488716  0.5714744  0.36497173]
 [0.9919525  0.8046277  0.89628744 0.7411659  0.41453266]]
[09/25/2024-15:59:49] [TRT] [V] Total per-runner device persistent memory is 1024
[09/25/2024-15:59:49] [TRT] [V] Total per-runner host persistent memory is 332416
[09/25/2024-15:59:49] [TRT] [V] Allocated activation device memory of size 7642112
[09/25/2024-15:59:49] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +8, now: CPU 0, GPU 18 (MiB)
[09/25/2024-15:59:49] [TRT] [V] CUDA lazy loading is enabled.
RGB input data copied to host buffer.
Depth input data copied to host buffer.
RGB input data copied to device.
Depth input data copied to device.
Model inference executed.
Output data copied to host.
Stream synchronized.
Output 0 shape: (50,)
Output 0 sample data: [ 0. -0.  0. -0.  0. -0.  0. -0.  0. -0.]

How can I predict correctly? I may have written something incorrectly and not expressed the subject properly, but these are the general lines.

Dear @ahmet.gumustas,
Could you double check that the preprocessing (and post-processing) is done correctly before feeding data to the TRT model? Make sure you give the same input to both the TRT model and the ONNX model for comparison.
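For example, you can run the exact same preprocessed arrays through ONNX Runtime and compare them with the TensorRT outputs (a rough sketch; the ONNX file path and the input tensor names "rgb" and "depth" are assumptions that must match your export):

import numpy as np
import onnxruntime as ort

# Load the ONNX model the engine was built from (path assumed)
sess = ort.InferenceSession("ddrnet.onnx")

# Feed the same preprocessed arrays that go into the TRT engine
onnx_outputs = sess.run(None, {"rgb": input_data_rgb, "depth": input_data_depth})

# Compare element-wise against the TensorRT results from your script
for trt_out, onnx_out in zip(results, onnx_outputs):
    print("max abs diff:", np.max(np.abs(trt_out.astype(np.float32) - onnx_out)))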


Hello @SivaRamaKrishnaNV

Thank you for your reply. I am trying everything but I keep failing. Maybe I will try this repository: https://github.com/NVIDIA-AI-IOT/torch2trt/tree/master. Is this repo okay for this model? What are your thoughts?
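If I try it, I think the conversion would look roughly like this (only a sketch; the DDRNet constructor, checkpoint name, and input shapes are my assumptions):

import torch
from torch2trt import torch2trt

# Hypothetical: build DDRNet from the repo and load the trained weights
model = build_ddrnet().eval().cuda()  # placeholder constructor
model.load_state_dict(torch.load("ddrnet.pth"))

# Example GPU inputs with the shapes the network expects (assumed)
rgb = torch.randn(1, 3, 360, 640).cuda()
depth = torch.randn(1, 1, 360, 640).cuda()

# Convert the PyTorch module directly to a TensorRT-backed module
model_trt = torch2trt(model, [rgb, depth], fp16_mode=True)
torch.save(model_trt.state_dict(), "ddrnet_trt.pth")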

Dear @SivaRamaKrishnaNV
I solved it. I went down to JetPack version 6.3, because the TensorRT version there is 8.6.2. The pretrained models work much better with this.
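For anyone who hits the same issue, the TensorRT version that Python actually sees can be checked with:

import tensorrt as trt
print(trt.__version__)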

Thank You!

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.