TensorRT Inference Issue

Hi everyone,

I hope someone can help me with this issue I’m facing while trying to use TensorRT with my YOLOv5 model on my NVIDIA Jetson Orin Nano.

Initially, I trained a YOLOv5 model, and it worked well, but it was running slowly. To improve performance, I decided to convert the YOLOv5 model to TensorRT. I used the following code to convert it:

import torch
import tensorrt as trt
import onnx
import os
import sys

# Add the YOLOv5 folder to the system path
sys.path.append('/home/onur/Desktop/projects/denemeV2/yolov5')
from yolov5.models.common import DetectMultiBackend

# PyTorch model path
model_path = "/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.pt"
onnx_path = model_path.replace(".pt", ".onnx")
tensorrt_path = model_path.replace(".pt", ".engine")

# Export PyTorch model to ONNX
def export_to_onnx(model_path, onnx_path):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = DetectMultiBackend(model_path, device=device)  # Fully load the model
    model.eval()
    dummy_input = torch.randn(1, 3, 640, 640, device=device)
    torch.onnx.export(
        model,
        dummy_input,
        onnx_path,
        opset_version=11,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes={'input': {0: 'batch_size'}, 'output': {0: 'batch_size'}}  # Dynamic input and output sizes
    )

# Convert ONNX model to TensorRT
def onnx_to_tensorrt(onnx_path, trt_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse ONNX model')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB workspace limit

    # Create optimization profile for dynamic shapes
    profile = builder.create_optimization_profile()
    input_name = network.get_input(0).name
    input_shape = (1, 3, 640, 640)
    profile.set_shape(input_name, min=input_shape, opt=input_shape, max=input_shape)
    config.add_optimization_profile(profile)

    # Use default FP32 precision; FP16 is not required.
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print('ERROR: Failed to build the engine')
        return

    # Save TensorRT engine
    with open(trt_path, 'wb') as f:
        f.write(serialized_engine)

# Run conversion steps
if __name__ == "__main__":
    # Step 1: Export to ONNX
    if not os.path.exists(onnx_path):
        export_to_onnx(model_path, onnx_path)
        print(f"ONNX model saved to {onnx_path}")

    # Step 2: Convert to TensorRT
    if os.path.exists(onnx_path):
        onnx_to_tensorrt(onnx_path, tensorrt_path)
        print(f"TensorRT engine saved to {tensorrt_path}")

The output indicated that the .engine file was created successfully:

onur@ubuntu:~/Desktop/projects$ /bin/python /home/onur/Desktop/projects/denemeV2/convert.py
Fusing layers...
Model summary: 157 layers, 7020913 parameters, 0 gradients, 15.8 GFLOPs
/home/onur/Desktop/projects/denemeV2/yolov5/models/common.py:688: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  y = self.model(im, augment=augment, visualize=visualize) if augment or visualize else self.model(im)
/home/onur/Desktop/projects/denemeV2/yolov5/models/yolo.py:101: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
  if self.dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
ONNX model saved to /home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.onnx
[11/18/2024-19:14:22] [TRT] [W] onnx2trt_utils.cpp:372: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
TensorRT engine saved to /home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine
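
(For reference, one way to inspect what the serialized engine actually expects before running it — a minimal sketch using the legacy bindings API, with tensorrt_path as defined in the conversion script:)

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
with open(tensorrt_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# One entry per binding: index, name, shape, and direction
for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i),
          tuple(engine.get_binding_shape(i)),
          'input' if engine.binding_is_input(i) else 'output')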

Then, I tried to use this TensorRT engine in the following Python code for live video inference:

import cv2
import time
import numpy as np
import pycuda.driver as cuda  # required for CUDA usage
import pycuda.autoinit
import tensorrt as trt

# TensorRT Logger
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Path to the TensorRT engine file
engine_path = '/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine'

# Load the TensorRT engine
with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Create the TensorRT execution context
context = engine.create_execution_context()

# Allocate CUDA memory for input and output
input_shape = (1, 3, 640, 640)  # Model input dimensions
output_shape = (1, 25200, 85)  # Model output dimensions (for YOLOv5 output)

# CUDA device memory
d_input = cuda.mem_alloc(trt.volume(input_shape) * np.float32().nbytes)
d_output = cuda.mem_alloc(trt.volume(output_shape) * np.float32().nbytes)
bindings = [int(d_input), int(d_output)]
stream = cuda.Stream()

# Define the GStreamer pipeline
def gstreamer_pipeline(
    sensor_id=0,
    capture_width=1280,  # Resolution: 3280 x 2464 FR = 21, 3280 x 1848 FR = 28, 1920 x 1080 FR = 29, 1640 x 1232 FR = 29, 1280 x 720 FR = 59
    capture_height=720,
    display_width=960,
    display_height=540,
    framerate=30,
    flip_method=6,
):
    return (
        "nvarguscamerasrc sensor-id=%d ! "
        "video/x-raw(memory:NVMM), width=(int)%d, height=(int)%d, framerate=(fraction)%d/1 ! "
        "nvvidconv flip-method=%d ! "
        "video/x-raw, width=(int)%d, height=(int)%d, format=(string)BGRx ! "
        "videoconvert ! "
        "video/x-raw, format=(string)BGR ! appsink"
        % (
            sensor_id,
            capture_width,
            capture_height,
            framerate,
            flip_method,
            display_width,
            display_height,
        )
    )

# Live video capture
cap = cv2.VideoCapture(gstreamer_pipeline(flip_method=6), cv2.CAP_GSTREAMER)
prev_frame_time = 0
new_frame_time = 0
prev_infer_time = time.time()
infer_interval = 0.5  # run the model at half-second intervals

if not cap.isOpened():
    print("Error: Unable to open camera")
    exit()

while cap.isOpened():
    ret, frame = cap.read()
    
    if not ret:
        print("Error: Unable to read frame from camera.")
        break
    
    current_time = time.time()
    # Run model inference at a set interval
    if current_time - prev_infer_time > infer_interval:
        # Prepare the input data in TensorRT format
        img_resized = cv2.resize(frame, (640, 640))
        img_transposed = np.transpose(img_resized, (2, 0, 1)).astype(np.float32) / 255.0
        img_input = np.ascontiguousarray(img_transposed[np.newaxis, ...])
        
        # Copy the input data to CUDA
        cuda.memcpy_htod_async(d_input, img_input, stream)
        # Run the model
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        # Retrieve the output data from CUDA
        output = np.empty(output_shape, dtype=np.float32)
        cuda.memcpy_dtoh_async(output, d_output, stream)
        stream.synchronize()

        # Update `prev_infer_time` after inference
        prev_infer_time = current_time

    # Calculate FPS
    new_frame_time = time.time()
    fps = 1 / (new_frame_time - prev_frame_time)
    prev_frame_time = new_frame_time
    
    # FPS value with two decimal places
    fps_text = "FPS: {:.2f}".format(fps)
    
    # Draw the FPS value on the frame
    frame_with_fps = frame.copy()  # draw on a copy to preserve the original frame
    cv2.putText(frame_with_fps, fps_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
    
    # Show the image on screen
    cv2.imshow('TensorRT Inference', frame_with_fps)
    
    if cv2.waitKey(10) & 0xFF == ord('q'):
        break

cap.release()
d_input.free()   # pycuda allocations are released via DeviceAllocation.free()
d_output.free()  # (pycuda has no cuda.mem_free; pycuda.autoinit manages the context)
cv2.destroyAllWindows()

When I ran the script, the camera opened briefly and then immediately closed, and I received the following warnings and errors:

onur@ubuntu:~/Desktop/projects$ /bin/python /home/onur/Desktop/projects/denemeV2/5.3.3live_video_with_trained_model.py
GST_ARGUS: Creating output stream
...
[11/18/2024-19:30:37] [TRT] [E] 3: [executionContext.cpp::enqueueInternal::816] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::enqueueInternal::816, condition: bindings[x] || nullBindingOK)
[11/18/2024-19:30:37] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
Traceback (most recent call last):
  File "/home/onur/Desktop/projects/denemeV2/5.3.3live_video_with_trained_model.py", line 91, in <module>
    cuda.memcpy_dtoh_async(output, d_output, stream)
pycuda._driver.LogicError: cuMemcpyDtoHAsync failed: an illegal memory access was encountered
...

Full error log:

onur@ubuntu:~/Desktop/projects$ /bin/python /home/onur/Desktop/projects/denemeV2/5.3.3live_video_with_trained_model.py
GST_ARGUS: Creating output stream
CONSUMER: Waiting until producer is connected...
GST_ARGUS: Available Sensor modes :
GST_ARGUS: 3280 x 2464 FR = 21.000000 fps Duration = 47619048 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 3280 x 1848 FR = 28.000001 fps Duration = 35714284 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1920 x 1080 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1640 x 1232 FR = 29.999999 fps Duration = 33333334 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: 1280 x 720 FR = 59.999999 fps Duration = 16666667 ; Analog Gain range min 1.000000, max 10.625000; Exposure Range min 13000, max 683709000;

GST_ARGUS: Running with following settings:
   Camera index = 0 
   Camera mode  = 4 
   Output Stream W = 1280 H = 720 
   seconds to Run    = 0 
   Frame Rate = 59.999999 
GST_ARGUS: Setup Complete, Starting captures for 0 seconds
GST_ARGUS: Starting repeat capture requests.
CONSUMER: Producer has connected; continuing.
[ WARN:0@1.056] global cap_gstreamer.cpp:1777 open OpenCV | GStreamer warning: Cannot query video position: status=0, value=-1, duration=-1
Gtk-Message: 19:30:36.879: Failed to load module "canberra-gtk-module"
[11/18/2024-19:30:37] [TRT] [E] 3: [executionContext.cpp::enqueueInternal::816] Error Code 3: API Usage Error (Parameter check failed at: runtime/api/executionContext.cpp::enqueueInternal::816, condition: bindings[x] || nullBindingOK
)
[11/18/2024-19:30:37] [TRT] [E] 1: [genericReformat.cuh::copyVectorizedRunKernel::1579] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
Traceback (most recent call last):
  File "/home/onur/Desktop/projects/denemeV2/5.3.3live_video_with_trained_model.py", line 91, in <module>
    cuda.memcpy_dtoh_async(output, d_output, stream)
pycuda._driver.LogicError: cuMemcpyDtoHAsync failed: an illegal memory access was encountered
[11/18/2024-19:30:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::61] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::61] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaStream::47] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
...
untime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaResources.cpp::~ScopedCudaEvent::24] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [defaultAllocator.cpp::deallocate::61] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
[11/18/2024-19:30:38] [TRT] [E] 1: [cudaDriverHelpers.cpp::operator()::94] Error Code 1: Cuda Driver (an illegal memory access was encountered)
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuMemFree failed: an illegal memory access was encountered
PyCUDA WARNING: a clean-up operation failed (dead context maybe?)
cuStreamDestroy failed: an illegal memory access was encountered
GST_ARGUS: Cleaning up
CONSUMER: Done Success
GST_ARGUS: Done Success

I am struggling to understand what went wrong here. The TensorRT engine was successfully built, but I can’t seem to run the inference properly. The camera works, but it quickly shuts down, and I receive multiple CUDA-related errors about illegal memory access.

Could anyone help me figure out where I might be going wrong and what I can do to resolve this issue?

Any guidance would be greatly appreciated. Thank you! @AastaLLL, @AakankshaS, @EduardoSalazar96, @allan.navarro, @proventusnova

Dear @onurrcifcii,
Can you quickly test the model with the trtexec tool to see if inference can be performed with dummy data?

Hello @SivaRamaKrishnaNV,

Here is the output:

onur@ubuntu:~$ /usr/src/tensorrt/bin/trtexec --loadEngine=/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine --verbose
&&&& RUNNING TensorRT.trtexec [TensorRT v8602] # /usr/src/tensorrt/bin/trtexec --loadEngine=/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine --verbose
[11/19/2024-09:20:20] [I] === Model Options ===
[11/19/2024-09:20:20] [I] Format: *
[11/19/2024-09:20:20] [I] Model: 
[11/19/2024-09:20:20] [I] Output:
[11/19/2024-09:20:20] [I] === Build Options ===
[11/19/2024-09:20:20] [I] Max batch: 1
[11/19/2024-09:20:20] [I] Memory Pools: workspace: default, dlaSRAM: default, dlaLocalDRAM: default, dlaGlobalDRAM: default
[11/19/2024-09:20:20] [I] minTiming: 1
[11/19/2024-09:20:20] [I] avgTiming: 8
[11/19/2024-09:20:20] [I] Precision: FP32
[11/19/2024-09:20:20] [I] LayerPrecisions: 
[11/19/2024-09:20:20] [I] Layer Device Types: 
[11/19/2024-09:20:20] [I] Calibration: 
[11/19/2024-09:20:20] [I] Refit: Disabled
[11/19/2024-09:20:20] [I] Version Compatible: Disabled
[11/19/2024-09:20:20] [I] ONNX Native InstanceNorm: Disabled
[11/19/2024-09:20:20] [I] TensorRT runtime: full
[11/19/2024-09:20:20] [I] Lean DLL Path: 
[11/19/2024-09:20:20] [I] Tempfile Controls: { in_memory: allow, temporary: allow }
[11/19/2024-09:20:20] [I] Exclude Lean Runtime: Disabled
[11/19/2024-09:20:20] [I] Sparsity: Disabled
[11/19/2024-09:20:20] [I] Safe mode: Disabled
[11/19/2024-09:20:20] [I] Build DLA standalone loadable: Disabled
[11/19/2024-09:20:20] [I] Allow GPU fallback for DLA: Disabled
[11/19/2024-09:20:20] [I] DirectIO mode: Disabled
[11/19/2024-09:20:20] [I] Restricted mode: Disabled
[11/19/2024-09:20:20] [I] Skip inference: Disabled
[11/19/2024-09:20:20] [I] Save engine: 
[11/19/2024-09:20:20] [I] Load engine: /home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine
[11/19/2024-09:20:20] [I] Profiling verbosity: 0
[11/19/2024-09:20:20] [I] Tactic sources: Using default tactic sources
[11/19/2024-09:20:20] [I] timingCacheMode: local
[11/19/2024-09:20:20] [I] timingCacheFile: 
[11/19/2024-09:20:20] [I] Heuristic: Disabled
[11/19/2024-09:20:20] [I] Preview Features: Use default preview flags.
[11/19/2024-09:20:20] [I] MaxAuxStreams: -1
[11/19/2024-09:20:20] [I] BuilderOptimizationLevel: -1
[11/19/2024-09:20:20] [I] Input(s)s format: fp32:CHW
[11/19/2024-09:20:20] [I] Output(s)s format: fp32:CHW
[11/19/2024-09:20:20] [I] Input build shapes: model
[11/19/2024-09:20:20] [I] Input calibration shapes: model
[11/19/2024-09:20:20] [I] === System Options ===
[11/19/2024-09:20:20] [I] Device: 0
[11/19/2024-09:20:20] [I] DLACore: 
[11/19/2024-09:20:20] [I] Plugins:
[11/19/2024-09:20:20] [I] setPluginsToSerialize:
[11/19/2024-09:20:20] [I] dynamicPlugins:
[11/19/2024-09:20:20] [I] ignoreParsedPluginLibs: 0
[11/19/2024-09:20:20] [I] 
[11/19/2024-09:20:20] [I] === Inference Options ===
[11/19/2024-09:20:20] [I] Batch: 1
[11/19/2024-09:20:20] [I] Input inference shapes: model
[11/19/2024-09:20:20] [I] Iterations: 10
[11/19/2024-09:20:20] [I] Duration: 3s (+ 200ms warm up)
[11/19/2024-09:20:20] [I] Sleep time: 0ms
[11/19/2024-09:20:20] [I] Idle time: 0ms
[11/19/2024-09:20:20] [I] Inference Streams: 1
[11/19/2024-09:20:20] [I] ExposeDMA: Disabled
[11/19/2024-09:20:20] [I] Data transfers: Enabled
[11/19/2024-09:20:20] [I] Spin-wait: Disabled
[11/19/2024-09:20:20] [I] Multithreading: Disabled
[11/19/2024-09:20:20] [I] CUDA Graph: Disabled
[11/19/2024-09:20:20] [I] Separate profiling: Disabled
[11/19/2024-09:20:20] [I] Time Deserialize: Disabled
[11/19/2024-09:20:20] [I] Time Refit: Disabled
[11/19/2024-09:20:20] [I] NVTX verbosity: 0
[11/19/2024-09:20:20] [I] Persistent Cache Ratio: 0
[11/19/2024-09:20:20] [I] Inputs:
[11/19/2024-09:20:20] [I] === Reporting Options ===
[11/19/2024-09:20:20] [I] Verbose: Enabled
[11/19/2024-09:20:20] [I] Averages: 10 inferences
[11/19/2024-09:20:20] [I] Percentiles: 90,95,99
[11/19/2024-09:20:20] [I] Dump refittable layers:Disabled
[11/19/2024-09:20:20] [I] Dump output: Disabled
[11/19/2024-09:20:20] [I] Profile: Disabled
[11/19/2024-09:20:20] [I] Export timing to JSON file: 
[11/19/2024-09:20:20] [I] Export output to JSON file: 
[11/19/2024-09:20:20] [I] Export profile to JSON file: 
[11/19/2024-09:20:20] [I] 
[11/19/2024-09:20:21] [I] === Device Information ===
[11/19/2024-09:20:21] [I] Selected Device: Orin
[11/19/2024-09:20:21] [I] Compute Capability: 8.7
[11/19/2024-09:20:21] [I] SMs: 8
[11/19/2024-09:20:21] [I] Device Global Memory: 7620 MiB
[11/19/2024-09:20:21] [I] Shared Memory per SM: 164 KiB
[11/19/2024-09:20:21] [I] Memory Bus Width: 128 bits (ECC disabled)
[11/19/2024-09:20:21] [I] Application Compute Clock Rate: 0.624 GHz
[11/19/2024-09:20:21] [I] Application Memory Clock Rate: 0.624 GHz
[11/19/2024-09:20:21] [I] 
[11/19/2024-09:20:21] [I] Note: The application clock rates do not reflect the actual clock rates that the GPU is currently running at.
[11/19/2024-09:20:21] [I] 
[11/19/2024-09:20:21] [I] TensorRT version: 8.6.2
[11/19/2024-09:20:21] [I] Loading standard plugins
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::BatchedNMSDynamic_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::BatchedNMS_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::BatchTilePlugin_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Clip_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::CoordConvAC version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::CropAndResizeDynamic version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::CropAndResize version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::DecodeBbox3DPlugin version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::DetectionLayer_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::EfficientNMS_Explicit_TF_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::EfficientNMS_Implicit_TF_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::EfficientNMS_ONNX_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::EfficientNMS_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::FlattenConcat_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::GenerateDetection_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::GridAnchor_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::GridAnchorRect_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::InstanceNormalization_TRT version 2
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::LReLU_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ModulatedDeformConv2d version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::MultilevelCropAndResize_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::MultilevelProposeROI_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::MultiscaleDeformableAttnPlugin_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::NMSDynamic_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::NMS_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Normalize_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::PillarScatterPlugin version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::PriorBox_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ProposalDynamic version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ProposalLayer_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Proposal version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::PyramidROIAlign_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Region_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Reorg_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ResizeNearest_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ROIAlign_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::RPROI_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::ScatterND version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::SpecialSlice_TRT version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::Split version 1
[11/19/2024-09:20:21] [V] [TRT] Registered plugin creator - ::VoxelGeneratorPlugin version 1
[11/19/2024-09:20:21] [I] Engine loaded in 0.0417025 sec.
[11/19/2024-09:20:21] [I] [TRT] Loaded engine size: 30 MiB
[11/19/2024-09:20:21] [V] [TRT] Deserialization required 104632 microseconds.
[11/19/2024-09:20:21] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +26, now: CPU 0, GPU 26 (MiB)
[11/19/2024-09:20:21] [I] Engine deserialized in 0.140276 sec.
[11/19/2024-09:20:21] [V] [TRT] Total per-runner device persistent memory is 0
[11/19/2024-09:20:21] [V] [TRT] Total per-runner host persistent memory is 346928
[11/19/2024-09:20:21] [V] [TRT] Allocated activation device memory of size 34586112
[11/19/2024-09:20:21] [I] [TRT] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +33, now: CPU 0, GPU 59 (MiB)
[11/19/2024-09:20:21] [V] [TRT] CUDA lazy loading is enabled.
[11/19/2024-09:20:21] [I] Setting persistentCacheLimit to 0 bytes.
[11/19/2024-09:20:21] [V] Using enqueueV3.
[11/19/2024-09:20:21] [I] Using random values for input input
[11/19/2024-09:20:21] [I] Input binding for input with dimensions 1x3x640x640 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_350 with dimensions 1x3x80x80x9 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_454 with dimensions 1x3x40x40x9 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_558 with dimensions 1x3x20x20x9 is created.
[11/19/2024-09:20:21] [I] Output binding for output with dimensions 1x25200x9 is created.
[11/19/2024-09:20:21] [I] Starting inference
[11/19/2024-09:20:24] [I] Warmup completed 6 queries over 200 ms
[11/19/2024-09:20:24] [I] Timing trace has 195 queries over 3.04397 s
[11/19/2024-09:20:24] [I] 
[11/19/2024-09:20:24] [I] === Trace details ===
[11/19/2024-09:20:24] [I] Trace averages of 10 runs:
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.2784 ms - Host latency: 15.8114 ms (enqueue 2.38609 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3753 ms - Host latency: 15.9077 ms (enqueue 2.43387 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.9062 ms - Host latency: 16.4367 ms (enqueue 2.33519 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3605 ms - Host latency: 15.893 ms (enqueue 2.50862 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.6005 ms - Host latency: 16.1419 ms (enqueue 2.51237 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.4761 ms - Host latency: 16.011 ms (enqueue 2.61516 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.7162 ms - Host latency: 16.2459 ms (enqueue 2.43772 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.5565 ms - Host latency: 16.0934 ms (enqueue 2.47794 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.4667 ms - Host latency: 15.9967 ms (enqueue 2.50048 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3204 ms - Host latency: 15.8583 ms (enqueue 2.26499 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.8881 ms - Host latency: 16.4221 ms (enqueue 2.3869 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.338 ms - Host latency: 15.8693 ms (enqueue 2.20602 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3054 ms - Host latency: 15.8356 ms (enqueue 2.27012 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3805 ms - Host latency: 15.9134 ms (enqueue 2.2479 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.9583 ms - Host latency: 16.4995 ms (enqueue 2.39375 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3768 ms - Host latency: 15.9072 ms (enqueue 2.33784 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.371 ms - Host latency: 15.9053 ms (enqueue 2.32856 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.8975 ms - Host latency: 16.4288 ms (enqueue 2.32102 ms)
[11/19/2024-09:20:24] [I] Average on 10 runs - GPU latency: 15.3916 ms - Host latency: 15.9242 ms (enqueue 2.23528 ms)
[11/19/2024-09:20:24] [I] 
[11/19/2024-09:20:24] [I] === Performance summary ===
[11/19/2024-09:20:24] [I] Throughput: 64.0611 qps
[11/19/2024-09:20:24] [I] Latency: min = 14.2336 ms, max = 20.0488 ms, mean = 16.059 ms, median = 16.0756 ms, percentile(90%) = 16.2903 ms, percentile(95%) = 16.4279 ms, percentile(99%) = 19.9612 ms
[11/19/2024-09:20:24] [I] Enqueue Time: min = 1.40015 ms, max = 2.94824 ms, mean = 2.37993 ms, median = 2.41153 ms, percentile(90%) = 2.59766 ms, percentile(95%) = 2.69373 ms, percentile(99%) = 2.93097 ms
[11/19/2024-09:20:24] [I] H2D Latency: min = 0.314453 ms, max = 0.42511 ms, mean = 0.355294 ms, median = 0.353729 ms, percentile(90%) = 0.365479 ms, percentile(95%) = 0.371338 ms, percentile(99%) = 0.40979 ms
[11/19/2024-09:20:24] [I] GPU Compute Time: min = 13.6963 ms, max = 19.4963 ms, mean = 15.5258 ms, median = 15.5381 ms, percentile(90%) = 15.7603 ms, percentile(95%) = 15.8994 ms, percentile(99%) = 19.4329 ms
[11/19/2024-09:20:24] [I] D2H Latency: min = 0.127197 ms, max = 0.194336 ms, mean = 0.177921 ms, median = 0.177643 ms, percentile(90%) = 0.182129 ms, percentile(95%) = 0.185547 ms, percentile(99%) = 0.193604 ms
[11/19/2024-09:20:24] [I] Total Host Walltime: 3.04397 s
[11/19/2024-09:20:24] [I] Total GPU Compute Time: 3.02753 s
[11/19/2024-09:20:24] [W] * GPU compute time is unstable, with coefficient of variance = 5.05568%.
[11/19/2024-09:20:24] [W]   If not already in use, locking GPU clock frequency or adding --useSpinWait may improve the stability.
[11/19/2024-09:20:24] [I] Explanations of the performance metrics are printed in the verbose logs.
[11/19/2024-09:20:24] [V] 
[11/19/2024-09:20:24] [V] === Explanations of the performance metrics ===
[11/19/2024-09:20:24] [V] Total Host Walltime: the host walltime from when the first query (after warmups) is enqueued to when the last query is completed.
[11/19/2024-09:20:24] [V] GPU Compute Time: the GPU latency to execute the kernels for a query.
[11/19/2024-09:20:24] [V] Total GPU Compute Time: the summation of the GPU Compute Time of all the queries. If this is significantly shorter than Total Host Walltime, the GPU may be under-utilized because of host-side overheads or data transfers.
[11/19/2024-09:20:24] [V] Throughput: the observed throughput computed by dividing the number of queries by the Total Host Walltime. If this is significantly lower than the reciprocal of GPU Compute Time, the GPU may be under-utilized because of host-side overheads or data transfers.
[11/19/2024-09:20:24] [V] Enqueue Time: the host latency to enqueue a query. If this is longer than GPU Compute Time, the GPU may be under-utilized.
[11/19/2024-09:20:24] [V] H2D Latency: the latency for host-to-device data transfers for input tensors of a single query.
[11/19/2024-09:20:24] [V] D2H Latency: the latency for device-to-host data transfers for output tensors of a single query.
[11/19/2024-09:20:24] [V] Latency: the summation of H2D Latency, GPU Compute Time, and D2H Latency. This is the latency to infer a single query.
[11/19/2024-09:20:24] [I] 
&&&& PASSED TensorRT.trtexec [TensorRT v8602] # /usr/src/tensorrt/bin/trtexec --loadEngine=/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.engine --verbose

Any ideas? @AastaLLL, @AakankshaS, @EduardoSalazar96, @allan.navarro, @proventusnova

Hello @AastaLLL, @AakankshaS, @EduardoSalazar96, @allan.navarro, @proventusnova

I am not sure about this code. Can you suggest alternative code that converts a .pt (YOLOv5) file to a .engine (TensorRT) file?

import torch
import tensorrt as trt
import onnx
import os
import sys
import cv2
import time
import numpy as np
import pycuda.driver as cuda  # required for CUDA usage
import pycuda.autoinit

# Add the YOLOv5 folder to the system path
sys.path.append('/home/onur/Desktop/projects/denemeV2/yolov5')
from yolov5.models.common import DetectMultiBackend

# PyTorch model path
model_path = "/home/onur/Desktop/projects/denemeV2/yolov5/runs/train/exp4/weights/last.pt"
onnx_path = model_path.replace(".pt", ".onnx")
tensorrt_path = model_path.replace(".pt", ".engine")

# Export PyTorch model to ONNX
def export_to_onnx(model_path, onnx_path):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = DetectMultiBackend(model_path, device=device)  # Fully load the model
    model.eval()
    dummy_input = torch.randn(1, 3, 640, 640, device=device)
    torch.onnx.export(
        model,
        dummy_input,
        onnx_path,
        opset_version=11,
        input_names=['input'],
        output_names=['output'],
        dynamic_axes=None  # fixed batch size, without dynamic inputs and outputs
    )

# Convert ONNX model to TensorRT
def onnx_to_tensorrt(onnx_path, trt_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    parser = trt.OnnxParser(network, TRT_LOGGER)

    with open(onnx_path, 'rb') as model:
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse ONNX model')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return

    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB workspace limit

    # Create an optimization profile using the fixed batch size
    profile = builder.create_optimization_profile()
    input_name = network.get_input(0).name
    input_shape = (1, 3, 640, 640)  # Fixed batch size
    profile.set_shape(input_name, min=input_shape, opt=input_shape, max=input_shape)
    config.add_optimization_profile(profile)

    # FP32 is the default; FP16 is not required.
    serialized_engine = builder.build_serialized_network(network, config)
    if serialized_engine is None:
        print('ERROR: Failed to build the engine')
        return

    # Save the TensorRT engine
    with open(trt_path, 'wb') as f:
        f.write(serialized_engine)

# Run conversion steps
if __name__ == "__main__":
    # Step 1: Export to ONNX
    if not os.path.exists(onnx_path):
        export_to_onnx(model_path, onnx_path)
        print(f"ONNX model saved to {onnx_path}")

    # Step 2: Convert to TensorRT
    if os.path.exists(onnx_path):
        onnx_to_tensorrt(onnx_path, tensorrt_path)
        print(f"TensorRT engine saved to {tensorrt_path}")

Here is my .pt file:
last.zip (12.5 MB)
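
(Aside: the YOLOv5 repository ships its own exporter, which can produce a TensorRT engine directly and may be simpler than a hand-rolled conversion — a usage sketch, assuming a standard YOLOv5 checkout with TensorRT installed:)

python export.py --weights runs/train/exp4/weights/last.pt --include engine --device 0 --imgsz 640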

Dear @onurrcifcii,

This indicates you are accessing unallocated memory.
Per the trtexec log, the engine loads and is able to perform inference with dummy data. Note the input and output binding sizes:

[11/19/2024-09:20:21] [I] Input binding for input with dimensions 1x3x640x640 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_350 with dimensions 1x3x80x80x9 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_454 with dimensions 1x3x40x40x9 is created.
[11/19/2024-09:20:21] [I] Output binding for onnx::Sigmoid_558 with dimensions 1x3x20x20x9 is created.
[11/19/2024-09:20:21] [I] Output binding for output with dimensions 1x25200x9 is created.

Please double-check the output buffer sizes.
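
(Per the trtexec output, the engine exposes four output bindings, and the final output is 1x25200x9 rather than the (1, 25200, 85) assumed in the script; execute_async_v2 needs one device pointer per binding. A minimal sketch of how the buffers might be allocated by querying the engine itself, using the same legacy bindings API as the script above:)

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()
stream = cuda.Stream()

# Allocate one device buffer per binding, sized from the engine itself
bindings, device_buffers, host_outputs = [], {}, {}
for i in range(engine.num_bindings):
    name = engine.get_binding_name(i)
    shape = tuple(engine.get_binding_shape(i))
    dtype = trt.nptype(engine.get_binding_dtype(i))
    buf = cuda.mem_alloc(int(np.prod(shape)) * np.dtype(dtype).itemsize)
    device_buffers[name] = buf
    bindings.append(int(buf))
    if not engine.binding_is_input(i):
        host_outputs[name] = np.empty(shape, dtype=dtype)

# Per frame (with img_input prepared as in the script above):
# cuda.memcpy_htod_async(device_buffers['input'], img_input, stream)
# context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
# for name, host in host_outputs.items():
#     cuda.memcpy_dtoh_async(host, device_buffers[name], stream)
# stream.synchronize()
# detections = host_outputs['output']  # shape (1, 25200, 9)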
