Description
I’m using the TensorRT 8 Python API, and when I create two execution contexts concurrently it throws a segmentation fault.
Environment
TensorRT Version: 8.0.0.3
GPU Type: T4
Nvidia Driver Version: 450
CUDA Version: 11.0
CUDNN Version: 8.2.0
Operating System + Version: CENTOS 7
Python Version (if applicable): 3.7.10
TensorFlow Version (if applicable):
PyTorch Version (if applicable):
Baremetal or Container (if container which image + tag):
Relevant Files
I have two engines as shown below.
engine1.trt (2.5 MB)
engine2.trt (3.2 MB)
Steps To Reproduce
Here is the code to reproduce the issue.
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU memory and device_mem means the GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


class TRTInference:
    def __init__(self, engine_file_path, model, batch_size=1):
        self.engine_file_path = engine_file_path
        self.model = model
        self.batch_size = batch_size
        self.TRT_LOGGER = trt.Logger()
        self.engine = self.get_engine()
        self.context = self.get_context()
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()

    def get_engine(self):
        # Deserialize the serialized engine from file.
        with open(self.engine_file_path, "rb") as f, trt.Runtime(self.TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    def get_context(self):
        return self.engine.create_execution_context()

    def allocate_buffers(self):
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for binding in self.engine:
            if self.engine.binding_is_input(binding):
                if self.model == "model2":
                    size = trt.volume([1, 3, 224, 224]) * self.engine.max_batch_size
                else:
                    size = trt.volume([1, 3, 320, 240]) * self.engine.max_batch_size
            else:
                if self.model == "model2":
                    size = trt.volume([1, 2]) * self.engine.max_batch_size
                else:
                    output_shape = self.engine.get_binding_shape(binding)
                    size = trt.volume([1, 4420, output_shape[-1]]) * self.engine.max_batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # Allocate host and device buffers.
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(device_mem))
            # Append to the appropriate list.
            if self.engine.binding_is_input(binding):
                inputs.append(HostDeviceMem(host_mem, device_mem))
            else:
                outputs.append(HostDeviceMem(host_mem, device_mem))
        return inputs, outputs, bindings, stream

    def do_inference(self, img, img_h, img_w):
        self.inputs[0].host = img
        self.context.active_optimization_profile = 0
        self.context.set_binding_shape(0, (self.batch_size, 3, img_h, img_w))
        # Transfer data from CPU to the GPU.
        [cuda.memcpy_htod_async(inp.device, inp.host, self.stream) for inp in self.inputs]
        # Run inference.
        self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        # Transfer predictions back from the GPU.
        [cuda.memcpy_dtoh_async(out.host, out.device, self.stream) for out in self.outputs]
        # Synchronize the stream.
        self.stream.synchronize()
        # Return only the host outputs.
        trt_outputs = [out.host for out in self.outputs]
        if self.model == "model2":
            res = np.expand_dims(trt_outputs[0][:2], axis=0)
        else:
            confidences_shape, locations_shape = self.context.get_binding_shape(2), self.context.get_binding_shape(1)
            confidences = np.reshape(trt_outputs[1][:confidences_shape[0] * confidences_shape[1] * confidences_shape[2]], confidences_shape)
            locations = np.reshape(trt_outputs[0][:locations_shape[0] * locations_shape[1] * locations_shape[2]], locations_shape)
            res = (confidences, locations)
        return res


if __name__ == "__main__":
    model1 = TRTInference("engine1.trt", "model1", batch_size=1)
    model2 = TRTInference("engine2.trt", "model2", batch_size=1)
Some explanations:
TRTInference is a wrapper class used to create the two engines concurrently. I found that if I comment out self.context = self.get_context() and self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers() in the __init__ of TRTInference, the script runs fine. As soon as I uncomment self.context = self.get_context() again, the segmentation fault shows up, so it seems like the two execution contexts conflict with each other (a stripped-down version of this experiment is sketched below).
The code above is only meant to pinpoint the problem. When I run the full inference pipeline on my own data, it produces the correct results, but it still throws a segmentation fault after the output is printed.
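For clarity, here is a stripped-down sketch of that experiment without the TRTInference wrapper. The file names refer to the attached engines, and the per-call logger mirrors self.TRT_LOGGER above; this is only an illustration of where the crash appears, not a separate script.

import tensorrt as trt
import pycuda.autoinit  # noqa: F401 -- initializes the CUDA context, as in the script above

def load_engine(path):
    # Each call creates its own logger, mirroring self.TRT_LOGGER in TRTInference
    # (presumably also what triggers the createInferRuntime warning in the traceback below).
    logger = trt.Logger()
    with open(path, "rb") as f, trt.Runtime(logger) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

engine1 = load_engine("engine1.trt")
context1 = engine1.create_execution_context()

engine2 = load_engine("engine2.trt")
context2 = engine2.create_execution_context()
# With the two create_execution_context() calls removed, this runs cleanly;
# with them in place, the segmentation fault appears.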
Traceback:
[TensorRT] WARNING: The logger passed into createInferRuntime differs from one already assigned, 0x557415842ea0, logger not updated.
[1] 4947 segmentation fault (core dumped) python detect.py
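A side note on the first line of the traceback: each TRTInference instance creates its own trt.Logger(), and the warning indicates that the second trt.Runtime(...) call received a logger that differs from the one already registered, so the old one is kept. A minimal sketch of sharing a single logger between both instances (illustrative only; I have not verified whether it has any effect on the segmentation fault itself) would be:

TRT_LOGGER = trt.Logger()  # one shared logger at module level

class TRTInference:
    def __init__(self, engine_file_path, model, batch_size=1):
        self.engine_file_path = engine_file_path
        self.model = model
        self.batch_size = batch_size
        self.TRT_LOGGER = TRT_LOGGER  # reuse the shared logger instead of creating a new one
        self.engine = self.get_engine()
        self.context = self.get_context()
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()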