CUDA error in TensorRT 5 when using DALI preprocessing

I use DALI to preprocess my data, then feed DALI's pipe_out into TensorRT for inference. I get this CUDA error:
Cuda error in file src/implicit_gemm.cu at line 1214: invalid resource handle
[TensorRT] ERROR: cuda/customWinogradConvActLayer.cpp (310) - Cuda Error in execute: 33
[TensorRT] ERROR: cuda/customWinogradConvActLayer.cpp (310) - Cuda Error in execute: 33

Partial code:
pipe_out = pipe.run()
pre_input, labels = pipe_out
pre_input_cpu = pre_input.asCPU()
pre_input_tensor = pre_input_cpu.as_tensor()
pre_input_ = np.array(pre_input_tensor)
input_ = np.array(pre_input_.ravel())
np.copyto(pagelocked_buffer, input_)
[output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)

The error happens at the line "[output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)".

DALI on its own produces correct output, and TensorRT 5 on its own (without DALI) produces correct output.

Interesting that you see no errors when executing DALI or TRT separately. CUDA error 33 indicates that a resource handle passed to the API call was not valid. So very likely you are re-using a handle that was created in the DALI context and passing it to the TRT context.
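
One likely culprit (an assumption to verify, not something visible in your snippet): pycuda.autoinit creates a brand-new CUDA context, while the CUDA runtime that DALI sits on uses the device's primary context, so a stream or buffer created under one context is an invalid handle under the other. Below is a minimal sketch of attaching pycuda to the primary context instead; Device.retain_primary_context() is a real pycuda call, but the device id and the single-threaded flow here are assumptions:

    import pycuda.driver as cuda

    # Instead of "import pycuda.autoinit", which creates a new context,
    # attach to the device's primary context -- the one the CUDA
    # runtime (and hence DALI) uses.
    cuda.init()
    dev = cuda.Device(0)                  # assumption: DALI also runs on device 0
    ctx = dev.retain_primary_context()
    ctx.push()                            # make it current for this thread

    # ... build the engine, allocate buffers and the stream, and run
    # do_inference() here, so every handle lives in one shared context ...

    ctx.pop()                             # detach when finished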

Yes, you are right. But how do I use DALI and TRT together?
The same error happens when using tensorflow-gpu and TRT together.
I think TRT will throw this error if any other process occupies GPU resources.
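
For what it's worth, if the concern is really TensorFlow occupying GPU memory (TF 1.x grabs nearly the whole GPU by default), the standard mitigation is to cap its allocation. This is a hedged sketch of the usual TF 1.x knobs, not a confirmed fix for the error above, which reads as a handle/context problem rather than memory exhaustion:

    import tensorflow as tf

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # allocate GPU memory lazily
    # config.gpu_options.per_process_gpu_memory_fraction = 0.5  # or hard-cap it
    sess = tf.Session(config=config)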

To help us debug, can you share a small repro demonstrating the CUDA error you see when DALI + TRT or TF + TRT run together?

DALI + TRT:
You need common.py, located in "TensorRT-5.0.2.6/samples/python". You also need a "resnet_v1_50.uff" that you generate yourself.

[sample.py]

"""This sample uses a UFF ResNet-50 model to create a TensorRT inference engine."""

import os
import sys
import argparse

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

import common

from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types

parser = argparse.ArgumentParser()
parser.add_argument('--data', help='test data')
args = parser.parse_args()

# You can set the logger severity higher to suppress messages
# (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

batch_size = 1


class SimplePipeline(Pipeline):
    def __init__(self, batch_size, num_threads, device_id, image_dir):
        super(SimplePipeline, self).__init__(batch_size, num_threads, device_id, seed=12)
        self.input = ops.FileReader(file_root=image_dir)
        # Instead of a path to a file directory, a file with
        # "image_name image_label_value" pairs can be provided:
        # self.input = ops.FileReader(file_root=image_dir, file_list=image_dir + '/file_list.txt')
        self.decode = ops.HostDecoder(output_type=types.RGB)
        self.resize = ops.Resize(device="gpu", resize_shorter=256.)
        self.cmnp = ops.CropMirrorNormalize(device="gpu",
                                            output_dtype=types.FLOAT,
                                            output_layout=types.NCHW,
                                            crop=(224, 224),
                                            image_type=types.RGB,
                                            mean=[123.68, 116.78, 103.94],
                                            std=[1., 1., 1.])
        # range=(0.5, 0.5) always yields 0.5, i.e. a center crop.
        self.uniform = ops.Uniform(range=(0.5, 0.5))

    def define_graph(self):
        jpegs, labels = self.input()
        images = self.decode(jpegs)
        resize = self.resize(images.gpu())
        output = self.cmnp(resize,
                           crop_pos_x=self.uniform(),
                           crop_pos_y=self.uniform())
        return (output, labels)


class ModelData(object):
    MODEL_FILE = os.path.join(os.path.dirname(__file__), "models/resnet_v1_50.uff")
    INPUT_NAME = "input"
    INPUT_SHAPE = (3, 224, 224)
    OUTPUT_NAME = "resnet_v1_50/SpatialSqueeze"


def build_engine(model_file):
    # For more information on TRT basics, refer to the introductory samples.
    print("build engine begin\n")
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
        builder.max_batch_size = 1
        builder.max_workspace_size = common.GiB(1)
        # Parse the UFF network.
        parser.register_input(ModelData.INPUT_NAME, ModelData.INPUT_SHAPE, trt.UffInputOrder.NHWC)
        parser.register_output(ModelData.OUTPUT_NAME)
        parser.parse(model_file, network)
        # Build and return an engine.
        return builder.build_cuda_engine(network)


def main():
    model_file = ModelData.MODEL_FILE
    with build_engine(model_file) as engine:
        # Allocate buffers and create a stream.
        # For more information on buffer allocation, refer to the introductory samples.
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)
        pagelocked_buffer = inputs[0].host
        with engine.create_execution_context() as context:
            pipe = SimplePipeline(batch_size, 1, 0, args.data)
            pipe.build()

            pipe_out = pipe.run()
            pre_input, labels = pipe_out
            # Move the preprocessed batch from the GPU to the host and
            # flatten it into the pagelocked input buffer.
            pre_input_cpu = pre_input.asCPU()
            pre_input_tensor = pre_input_cpu.as_tensor()
            pre_input_ = np.array(pre_input_tensor)

            input_ = np.array(pre_input_.ravel())
            np.copyto(pagelocked_buffer, input_)

            [output] = common.do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
            #print(output)


if __name__ == '__main__':
    main()
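
For reference, the script would be invoked like this, assuming the images live in a directory laid out the way DALI's FileReader expects (one subdirectory per class):

    python sample.py --data /path/to/image_dir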

Can you please upload the source and the UFF file?

Here's how to attach files to the post, or you can use Google Drive:
https://devtalk.nvidia.com/default/topic/1043347/announcements/attaching-files-to-forum-topics-posts/

Due to company policy, I am not allowed to upload files.
You can use any other UFF file instead of my resnet_v1_50.uff.
I think you can reproduce this issue if you are familiar with TRT.
common.py is the sample code shipped with TensorRT 5.
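
For anyone reproducing this without the TensorRT samples at hand, the two helpers the script relies on do roughly the following. This is condensed from TensorRT 5's samples/python/common.py and paraphrased from memory, so treat it as a sketch rather than the verbatim file:

    import pycuda.driver as cuda
    import tensorrt as trt

    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem      # pagelocked numpy array
            self.device = device_mem  # device allocation

    def allocate_buffers(engine):
        # One pagelocked host buffer + one device buffer per binding.
        inputs, outputs, bindings = [], [], []
        stream = cuda.Stream()
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
        # Host -> device, enqueue the network, device -> host, then sync.
        for inp in inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, stream)
        context.execute_async(batch_size=batch_size, bindings=bindings,
                              stream_handle=stream.handle)
        for out in outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, stream)
        stream.synchronize()
        return [out.host for out in outputs]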

Is there any Python example of using TRT + DALI together?