Using TensorRT and PyCUDA for inference but cannot obtain correct results on Xavier NX

I want to run inference with PyCUDA and TensorRT, but it fails.
If I set times=1, I do not get all the results matching the ground truth. But if I set times=10, I get correct results from the model.
I also tested the same model (same ONNX → TensorRT) on a 2080 and got correct results.
How can I get correct results with just one inference pass?

The code looks like this:

import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
import time


TRT_LOGGER = trt.Logger()
# from tensorrt.common
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.cpu = self.host = host_mem
        self.gpu = self.device = device_mem

    def __str__(self):
        return "Host: \n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTEngine:
    def __init__(self, engine_file_path, mode) -> None:
        with open(engine_file_path, "rb") as f, trt.Runtime(
            TRT_LOGGER
        ) as runtime:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.mode = mode
        self.context = self.engine.create_execution_context()
        inputs, outputs, bindings, stream = self.allocate_buffers(self.engine)
        self.inputs = inputs
        self.outputs = outputs
        self.bindings = bindings
        self.stream = stream


    def allocate_buffers(self, engine):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(
                engine.get_binding_shape(binding)
            )  # * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # FIXME
            if self.mode == "async":
                # https://documen.tician.de/pycuda/driver.html?highlight=htod#pycuda.driver.memcpy_htod
                host_mem = cuda.pagelocked_empty(size, dtype)
            else:
                host_mem = np.zeros(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
                print(size, dtype)
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))

        return inputs, outputs, bindings, stream

    def exec_sync(self, times):
        # cpu -> gpu
        [
            cuda.memcpy_htod(inp.device, inp.host)
            for inp in self.inputs
        ]
        for i in range(times):
            self.context.execute(
                batch_size=1,
                bindings=self.bindings,
            )
        # gpu -> cpu
        [
            cuda.memcpy_dtoh(out.host, out.device)
            for out in self.outputs
        ]
        return [out.host for out in self.outputs]

    # from tensorrt.common
    def exec_async(self, times=100):
        [
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
            for inp in self.inputs
        ]
        for i in range(times):
            self.context.execute_async_v2(
                bindings=self.bindings, stream_handle=self.stream.handle
            )
        [
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
            for out in self.outputs
        ]
        self.stream.synchronize()
        return [out.host for out in self.outputs]

    def __call__(self, inputs, times=1):
        for i in range(len(inputs)):
            # self.inputs[i].host = np.ascontiguousarray(inputs[i])
            np.copyto(self.inputs[i].host, inputs[i].ravel().reshape(-1))
        if self.mode == "async":
            return self.exec_async(times)
        else:
            return self.exec_sync(times)


def transfer_pickle_cuda_to_numpy():
    import pickle5 as pickle
    for i in range(20):
        fname = f"/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.pickle"
        with open(fname, "rb") as fp:
            ss = pickle.load(fp)
        a = {}
        for k in ss.keys():
            if k == "matching":
                a[k] = ss[k]
            #continue
            else:
                a[k] = ss[k].cpu().numpy()
        with open(fname + "new", "wb") as fp:
            pickle.dump(a, fp)
            print("dump", i)

def compare_pth_trt(mode, times):
    import pickle5 as pickle
    from IPython import embed

    net_trt = TRTEngine(
        "./wheeljack_20220411_epoch_260_numsDT_20.trt",
        #("./transmot_numsDT20.trt")
        mode=mode
    )

    for i in range(20):
        with open(
            f"/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.picklenew", "rb"
        ) as f:
            data = pickle.load(f)
        keys = [
            "tracklets_norm_xyz",
            "tracklets_norm_xyzs",
            "detection_norm_xyz",
            "tracklets_motion",
            "out_mask",
        ]
        inputs = []
        for k in keys:
            inputs.append(data[k].astype("float32"))
            print(k, inputs[-1].shape, inputs[-1].dtype)
        _start = time.time()
        # inference start
        indices0 = net_trt(inputs, times=times)
        matching = {i: j for i, j in enumerate(indices0[0]) if j != -1}
        # inference end
        _end = time.time()
        _tot = 1000 * (_end - _start)

        print("diff:", matching, data["matching"])
        print(
            "finished {} times. total = {} ms, avg = {} ms, qps = {}.".format(
                "%d" % (times),
                "%.2f" % (_tot),
                "%.2f" % (_tot / times),
                "%.3f" % (1000 * times / _tot),
            )
        )


if __name__ == "__main__":
    # transfer_pickle_cuda_to_numpy()
    compare_pth_trt(mode="async", times=1)

# 1. Whether sync or async, I only get correct results after running exec multiple times.
# 2. When running inside our pipeline, this error is reported:
#       [TensorRT] ERROR: 1: [slice.cu::launchNaiveSliceImpl::148] Error Code 1: Cuda Runtime (invalid resource handle)
#       https://stackoverflow.com/questions/58369731/adding-multiple-inference-on-tensorrt-invalid-resource-handle-error
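
For error 2, the linked Stack Overflow thread points at CUDA context ownership: when the engine is executed from a thread other than the one that imported pycuda.autoinit (as happens inside a pipeline), no CUDA context is current in that thread and the runtime returns "invalid resource handle". Below is a minimal sketch of the usual fix, assuming the TRTEngine class above (run_in_worker_thread is a made-up helper name, not part of the original script):

import pycuda.driver as cuda

def run_in_worker_thread(engine_path, inputs):
    cuda.init()
    # create and push a context owned by this thread instead of
    # relying on the context that pycuda.autoinit made elsewhere
    ctx = cuda.Device(0).make_context()
    try:
        net = TRTEngine(engine_path, mode="async")  # engine and buffers now belong to ctx
        return net(inputs, times=1)
    finally:
        ctx.pop()  # detach the context before the thread exits

For observation 1, a common stop-gap while debugging is a single discarded warm-up call at startup (e.g. one net(inputs, times=1) whose output is ignored) before the timed inference, though that only hides the symptom.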

I also tried using tensors already on the GPU and passing data_ptr() for inference, but that failed as well:

def exec(self, inputs, out, times=1):
    """
    inputs: list of tensors already on the GPU
    out: output tensor already on the GPU
    """
    for i in range(times):
        self.context.execute_async_v2(
            bindings=[
                inputs[0].data_ptr(),
                inputs[1].data_ptr(),
                inputs[2].data_ptr(),
                inputs[3].data_ptr(),
                inputs[4].data_ptr(),
                out.data_ptr(),
            ],
            stream_handle=torch.cuda.current_stream().cuda_stream,
        )
    # [
    #     cuda.memcpy_dtoh_async(out.host, out.device, torch.cuda.current_stream())
    #     for out in self.outputs
    # ]
    torch.cuda.current_stream().synchronize()
    return [out.cpu().numpy()]

log:

[TensorRT] ERROR: 1: [hardwareContext.cpp::configure::92] Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)
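
CUDNN_STATUS_MAPPING_ERROR here is often another context mismatch: importing pycuda.autoinit alongside torch leaves two CUDA contexts in the process, and TensorRT can end up executing in a different context than the one that owns the tensors. A hedged sketch of the same call with the checks that usually matter, assuming the execution context built in the script above and no pycuda.autoinit import (exec_torch is a made-up name):

import torch

def exec_torch(context, inputs, out, times=1):
    # data_ptr() of a non-contiguous view does not match the dense
    # layout TensorRT expects, so force contiguous tensors first
    inputs = [t.contiguous() for t in inputs]
    out = out.contiguous()
    bindings = [t.data_ptr() for t in inputs] + [out.data_ptr()]
    stream = torch.cuda.current_stream()
    for _ in range(times):
        context.execute_async_v2(
            bindings=bindings, stream_handle=stream.cuda_stream
        )
    stream.synchronize()  # wait for the enqueued work before reading out
    return out.cpu().numpy()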

Hi,

May I know which JetPack version you are using?

We want to reproduce this issue in our environment as well.
Could you also share the data and model with us?

/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.picklenew
/home/dev/gz/trt_test/transmot_batch_data_numsDT_20/batch_data_for_transmot_{i}.pickle
wheeljack_20220411_epoch_260_numsDT_20.trt  # plus the original model, e.g. the *.onnx

Thanks.

My JetPack:

 - NVIDIA Jetson Xavier NX (Developer Kit Version)
   * Jetpack 4.6 [L4T 32.6.1]
   * NV Power Mode: MODE_15W_6CORE - Type: 2
   * jetson_stats.service: active
 - Libraries:
   * CUDA: 10.2.300
   * cuDNN: 8.2.1.32
   * TensorRT: 8.0.1.6
   * Visionworks: 1.6.0.501
   * OpenCV: 4.5.4 compiled CUDA: YES
   * VPI: ii libnvvpi1 1.1.15 arm64 NVIDIA Vision Programming Interface library
   * Vulkan: 1.2.70

Here are my files. In fact, batch_data_for_transmot_{i}.pickle is not needed. Thanks.

transmot_numsDT20.trt (40.2 MB)
transmot_numsDT20.onnx (14.0 MB)
batch_data_for_transmot_10.picklenew (9.7 KB)

Hi,

We tried your source but do not understand the output.
Could you share some details with us?

[default]

tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 12.54 ms, avg = 12.54 ms, qps = 79.758.
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 12.52 ms, avg = 12.52 ms, qps = 79.870.

[times=10]

compare_pth_trt(mode="async", times=10)
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 126.61 ms, avg = 12.66 ms, qps = 78.980.
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 123.98 ms, avg = 12.40 ms, qps = 80.659.

Thanks.

This output is the prediction and the ground truth; the prediction should be the same as the ground truth. It seems you don't hit the problem. Did you run the scripts on an NX?
I have tried two NX boards and got the same result.
Also, your TRT model seems to be faster.

Here is my log.
When times=1:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {4: 5} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 82.35 ms, avg = 82.35 ms, qps = 12.143.

When times=10:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 10 times. total = 261.57 ms, avg = 26.16 ms, qps = 38.230.

Hi,

We tried this again on a Xavier NX with JetPack 4.6.2.
The performance is still higher than your result.

tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 14.93 ms, avg = 14.93 ms, qps = 66.967.

May I know which power mode you use?
Could you give JetPack 4.6.2 a try?

Thanks.

My power mode in jtop:

(screenshot)

And my JetPack is 4.6.1; I will try 4.6.2.

It seems I can get the right result on JetPack 4.6.2 when times=1:

420 <class 'numpy.float32'>
1000 <class 'numpy.float32'>
420 <class 'numpy.float32'>
80 <class 'numpy.float32'>
400 <class 'numpy.float32'>
tracklets_norm_xyz (1, 20, 21) float32
tracklets_norm_xyzs (1, 20, 5, 10) float32
detection_norm_xyz (1, 20, 21) float32
tracklets_motion (1, 20, 4) float32
out_mask (1, 20, 20) float32
diff: {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7} {0: 2, 2: 0, 4: 5, 7: 4, 11: 9, 12: 10, 14: 3, 16: 6, 19: 7}
finished 1 times. total = 73.55 ms, avg = 73.55 ms, qps = 13.597.

Good to know this.
Does JetPack 4.6.2 meet your requirements?

Yes, thanks!
