Using TensorRT and PyCUDA for inference but cannot obtain correct results on Xavier NX

I want to run inference with PyCUDA and TensorRT, but it fails.
If I set times=1, I do not get all the results matching the ground truth. But if I set times=10, I get correct results from the model.
I also tested the same model (same ONNX → TensorRT) on a 2080 and got correct results.
How can I get correct results with just one inference pass?

The code looks like this:

import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time


TRT_LOGGER = trt.Logger()
# from tensorrt.common
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.cpu = self.host = host_mem
        self.gpu = self.device = device_mem

    def __str__(self):
        return "Host: \n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTEngine:
    def __init__(self, engine_file_path, mode) -> None:
        with open(engine_file_path, "rb") as f, trt.Runtime(
            TRT_LOGGER
        ) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.mode = mode
        self.context = self.engine.create_execution_context()
        inputs, outputs, bindings, stream = self.allocate_buffers(self.engine)
        self.inputs = inputs
        self.outputs = outputs
        self.bindings = bindings
        self.stream = stream


    def allocate_buffers(self, engine):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(
                engine.get_binding_shape(binding)
            )  # * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # FIXME
            if self.mode == "async":
                # https://documen.tician.de/pycuda/driver.html?highlight=htod#pycuda.driver.memcpy_htod
                host_mem = cuda.pagelocked_empty(size, dtype)
            else:
                host_mem = np.zeros(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
                print(size, dtype)
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream

    def exec_sync(self, times):
        # cpu -> gpu
        [
            cuda.memcpy_htod(inp.device, inp.host)
            for inp in self.inputs
        ]
        for i in range(times):
            self.context.execute(
                batch_size=1,
                bindings=self.bindings,
            )
        # gpu -> cpu
        [
            cuda.memcpy_dtoh(out.host, out.device)
            for out in self.outputs
        ]
        return [out.host for out in self.outputs]

    # from tensorrt.common
    def exec_async(self, times=100):
        [
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
            for inp in self.inputs
        ]
        for i in range(times):
            self.context.execute_async_v2(
                bindings=self.bindings, stream_handle=self.stream.handle
            )
        [
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
            for out in self.outputs
        ]
        self.stream.synchronize()
        return [out.host for out in self.outputs]

    def __call__(self, inputs, times=1):
        for i in range(len(inputs)):
            # self.inputs[i].host = np.ascontiguousarray(inputs[i])
            np.copyto(self.inputs[i].host, inputs[i].ravel().reshape(-1))
        if self.mode == "async":
            return self.exec_async(times)
        else:
            return self.exec_sync(times)


def transfer_pickle_cuda_to_numpy():
    import pickle5 as pickle
    for i in range(20):
        fname = f"/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.pickle"
        with open(fname, "rb") as fp:
            ss = pickle.load(fp)
        a = {}
        for k in ss.keys():
            if k == "matching":
                a[k] = ss[k]
            #continue
            else:
                a[k] = ss[k].cpu().numpy()
        with open(fname + "new", "wb") as fp:
            pickle.dump(a, fp)
            print("dump", i)

def compare_pth_trt(mode, times):
    import pickle5 as pickle
    from IPython import embed

    net_trt = TRTEngine(
        "./wheeljack_20220411_epoch_260_numsDT_20.trt",
        #("./transmot_numsDT20.trt")
        mode=mode
    )

    for i in range(20):
        with open(
            f"/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.picklenew", "rb"
        ) as f:
            data = pickle.load(f)
        keys = [
            "tracklets_norm_xyz",
            "tracklets_norm_xyzs",
            "detection_norm_xyz",
            "tracklets_motion",
            "out_mask",
        ]
        inputs = []
        for k in keys:
            inputs.append(data[k].astype("float32"))
            print(k, inputs[-1].shape, inputs[-1].dtype)
        _start = time.time()
        # inference start
        indices0 = net_trt(inputs, times=times)
        matching = {i: j for i, j in enumerate(indices0[0]) if j != -1}
        # inference end
        _end = time.time()
        _tot = 1000 * (_end - _start)

        print("diff:", matching, data["matching"])
        print(
            "finished {} times. total = {} ms, avg = {} ms, qps = {}.".format(
                "%d" % (times),
                "%.2f" % (_tot),
                "%.2f" % (_tot / times),
                "%.3f" % (1000 * times / _tot),
            )
        )


if __name__ == "__main__":
    # transfer_pickle_cuda_to_numpy()
    compare_pth_trt(mode="async", times=1)

# 1. Whether sync or async, I only get correct results after running exec multiple times.
# 2. When running inside our pipeline, this error is reported:
#       [TensorRT] ERROR: 1: [slice.cu::launchNaiveSliceImpl::148] Error Code 1: Cuda Runtime (invalid resource handle)
#       https://stackoverflow.com/questions/58369731/adding-multiple-inference-on-tensorrt-invalid-resource-handle-error
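
For error 2, the linked Stack Overflow thread points at CUDA context ownership: when the engine is executed from a thread other than the one that imported pycuda.autoinit (as happens inside a pipeline), no CUDA context is current in that thread and the runtime returns "invalid resource handle". Below is a minimal sketch of the usual fix, assuming the TRTEngine class above (run_in_worker_thread is a made-up helper name, not part of the original script):

import pycuda.driver as cuda

def run_in_worker_thread(engine_path, inputs):
    cuda.init()
    # create and push a context owned by this thread instead of
    # relying on the context that pycuda.autoinit made elsewhere
    ctx = cuda.Device(0).make_context()
    try:
        net = TRTEngine(engine_path, mode="async")  # engine and buffers now belong to ctx
        return net(inputs, times=1)
    finally:
        ctx.pop()  # detach the context before the thread exits

For observation 1, a common stop-gap while debugging is a single discarded warm-up call at startup (e.g. one net(inputs, times=1) whose output is ignored) before the timed inference, though that only hides the symptom.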

I also tried using tensors already on the GPU and passing data_ptr() for inference, but that failed as well:

def exec(self, inputs, out, times=1):
    """
    inputs: list of tensors already on the GPU
    out: output tensor already on the GPU
    """
    for i in range(times):
        self.context.execute_async_v2(
            bindings=[
                inputs[0].data_ptr(),
                inputs[1].data_ptr(),
                inputs[2].data_ptr(),
                inputs[3].data_ptr(),
                inputs[4].data_ptr(),
                out.data_ptr(),
            ],
            stream_handle=torch.cuda.current_stream().cuda_stream,
        )
    # [
    #     cuda.memcpy_dtoh_async(out.host, out.device, torch.cuda.current_stream())
    #     for out in self.outputs
    # ]
    torch.cuda.current_stream().synchronize()
    return [out.cpu().numpy()]

log:

[TensorRT] ERROR: 1: [hardwareContext.cpp::configure::92] Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)
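
CUDNN_STATUS_MAPPING_ERROR here is often another context mismatch: importing pycuda.autoinit alongside torch leaves two CUDA contexts in the process, and TensorRT can end up executing in a different context than the one that owns the tensors. A hedged sketch of the same call with the checks that usually matter, assuming the execution context built in the script above and no pycuda.autoinit import (exec_torch is a made-up name):

import torch

def exec_torch(context, inputs, out, times=1):
    # data_ptr() of a non-contiguous view does not match the dense
    # layout TensorRT expects, so force contiguous tensors first
    inputs = [t.contiguous() for t in inputs]
    out = out.contiguous()
    bindings = [t.data_ptr() for t in inputs] + [out.data_ptr()]
    stream = torch.cuda.current_stream()
    for _ in range(times):
        context.execute_async_v2(
            bindings=bindings, stream_handle=stream.cuda_stream
        )
    stream.synchronize()  # wait for the enqueued work before reading out
    return out.cpu().numpy()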

Hi,

May I know which JetPack version you are using?

We want to reproduce this issue in our environment as well.
Could you also share the data and model with us?

/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.picklenew
/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.pickle
wheeljack_20220411_epoch_260_numsDT_20.trt  # plus the original model, e.g. the *.onnx

Thanks.

My JetPack:

 - NVIDIA Jetson Xavier NX (Developer Kit Version)
   * Jetpack 4.6 [L4T 32.6.1]
   * NV Power Mode: MODE_15W_6CORE - Type: 2
   * jetson_stats.service: active
 - Libraries:
   * CUDA: 10.2.300
   * cuDNN: 8.2.1.32
   * TensorRT: 8.0.1.6
   * Visionworks: 1.6.0.501
   * OpenCV: 4.5.4 compiled CUDA: YES
   * VPI: ii libnvvpi1 1.1.15 arm64 NVIDIA Vision Programming Interface library
   * Vulkan: 1.2.70

Here are my files. In fact, batch_data_for_transmot_{i}.pickle is not needed. Thanks.

transmot_numsDT20.trt (40.2 MB)
transmot_numsDT20.onnx (14.0 MB)
batch_data_for_transmot_10.picklenew (9.7 KB)

Hi,

We tried your source but do not understand the output.
Could you share some details with us?

[default]

tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 12.54 ms, avg = 12.54 ms, qps = 79.758.
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 12.52 ms, avg = 12.52 ms, qps = 79.870.

[times=10]

compare_pth_trt(mode="async", times=10)
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 126.61 ms, avg = 12.66 ms, qps = 78.980.
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 123.98 ms, avg = 12.40 ms, qps = 80.659.

Thanks.

This output is the prediction and the ground truth; the prediction should be the same as the ground truth. It seems you don't hit the problem. Did you run the scripts on an NX?
I have tried two NX boards and got the same result.
Also, your TRT model seems to be faster.

Here is my log.
When times=1:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {4: 5} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 82.35 ms, avg = 82.35 ms, qps = 12.143.

When times=10:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 261.57 ms, avg = 26.16 ms, qps = 38.230.

Hi,

We tried this again on a Xavier NX with JetPack 4.6.2.
The performance is still higher than your result.

tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 14.93 ms, avg = 14.93 ms, qps = 66.967.

May I know which power mode you use?
Could you give JetPack 4.6.2 a try?

Thanks.

My power mode in jtop:

(screenshot)

And my JetPack is 4.6.1; I will try 4.6.2.

It seems I can get the right result on JetPack 4.6.2 when times=1:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 73.55 ms, avg = 73.55 ms, qps = 13.597.

Good to know this.
Does JetPack 4.6.2 meet your requirements?

Yes, thanks!
