Description
I am trying to use PyCUDA with TensorRT for model inference on a Jetson Nano.
However, I am stuck on how to release the memory occupied by a previously loaded model. I have tried deleting the CUDA context, the execution context, and the engine itself, but none of them works.
Of course, the memory is released if I terminate my script, or if I run the model in a separate process and terminate that process. I just wonder whether there is another way to free this memory directly within the same process.
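For completeness, the separate-process workaround I mentioned looks roughly like the sketch below (TrtModel is the class from the Steps To Reproduce section; the engine path, empty config, and input shape are placeholders):

import multiprocessing as mp
import numpy as np

def run_model(engine_path, result_queue):
    # All TensorRT / PyCUDA state lives only inside this child process
    model = TrtModel(engine_path, config={})
    dummy = np.zeros((1, 3, *model.img_shape), dtype=np.float32)
    result_queue.put(model.inference(dummy))

if __name__ == '__main__':
    queue = mp.Queue()
    worker = mp.Process(target=run_model, args=('model.engine', queue))
    worker.start()
    outputs = queue.get()
    worker.join()  # once the child exits, all of its GPU memory is released

This works, but I would prefer to release the memory without going through a separate process.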
I notice that in C++ there is a destroy() method, but in Python it seems that del does not behave the same way.
Environment
Jetson Nano:
TensorRT Version: 7.1.3.0
CUDA Version: 10.2
Steps To Reproduce
import gc
import logging
from collections import namedtuple

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt

LOGGER = logging.getLogger(__name__)

# Initialise the CUDA driver API explicitly (pycuda.autoinit is not used,
# so the context can be created and released manually)
cuda.init()


class TrtModel():
    def __init__(self, engine_path, config):
        assert trt and cuda, "Can't run TRT inference on this device"
        device = cuda.Device(0)
        self.cuda_ctx = device.make_context()
        self.config = config
        # Expose the config entries as attributes
        self.__dict__.update((k, v) for k, v in config.items())
        self.channel_first = False
        self.img_shape = (0, 0)
        logger = trt.Logger(trt.Logger.WARNING)
        trt_runtime = trt.Runtime(logger)
        # Load model
        trt.init_libnvinfer_plugins(None, "")
        with open(engine_path, 'rb') as f:
            engine_data = f.read()
        self.engine = trt_runtime.deserialize_cuda_engine(engine_data)
        self.inputs, self.outputs, self.bindings, self.stream = self.allocate(self.engine)
        self.context = self.engine.create_execution_context()
        self.cuda_ctx.pop()
    def allocate(self, engine):
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'host_mem', 'device_mem'))
        inputs = []
        outputs = []
        bindings = []
        stream = cuda.Stream()
        for index in range(engine.num_bindings):
            name = engine.get_binding_name(index)
            dtype = trt.nptype(engine.get_binding_dtype(index))
            shape = tuple(engine.get_binding_shape(index))
            size = trt.volume(shape)
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            bindings.append(int(device_mem))
            if engine.binding_is_input(engine[index]):
                inputs.append(Binding(name, dtype, shape, host_mem, device_mem))
            else:
                outputs.append(Binding(name, dtype, shape, host_mem, device_mem))
        if inputs[0].shape[1] == 3:
            self.channel_first = True
            self.img_shape = inputs[0].shape[2:]
        else:
            self.img_shape = inputs[0].shape[1:3]
        return inputs, outputs, bindings, stream
    def inference(self, im, batch_size=1):
        self.cuda_ctx.push()
        assert im.shape == self.inputs[0].shape, (im.shape, self.inputs[0].shape)
        im = im.astype(self.inputs[0].dtype)
        np.copyto(self.inputs[0].host_mem, im.ravel())
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device_mem, inp.host_mem, self.stream)
        # execute_async_v2 ignores the batch_size argument, so execute_async is used here
        self.context.execute_async(batch_size=batch_size, bindings=self.bindings, stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host_mem, out.device_mem, self.stream)
        self.stream.synchronize()
        self.cuda_ctx.pop()
        return {out.name: out.host_mem.reshape(out.shape) for out in self.outputs}
    def __del__(self):
        try:
            for inp in self.inputs:
                inp.device_mem.free()
            for out in self.outputs:
                out.device_mem.free()
            LOGGER.info('Freed allocated GPU memory')
            self.cuda_ctx.pop()
        except Exception:
            pass
        del self.context
        del self.engine
        del self.cuda_ctx
        del self.stream
        del self.outputs
        del self.inputs
        gc.collect()
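For reference, this is roughly how I observe the problem (the engine path and input shape are placeholders; on Jetson the memory usage can be watched with tegrastats in a second terminal):

import gc
import numpy as np

model = TrtModel('model.engine', config={})                   # memory usage jumps here
dummy = np.zeros((1, 3, *model.img_shape), dtype=np.float32)  # assumes a channel-first engine
outputs = model.inference(dummy)

del model     # __del__ frees the device buffers and pops the context
gc.collect()  # but the memory is still not returned to the system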