Trying to use unified memory with TensorRT on a Jetson Nano

I am trying to use unified memory to reduce inference time, but I keep getting these unhelpful errors:
[TensorRT] ERROR: …/rtSafe/cuda/cudaSoftMaxRunner.cpp (123) - Cudnn Error in execute: 8 (CUDNN_STATUS_EXECUTION_FAILED)
[TensorRT] ERROR: FAILED_EXECUTION: std::exception
[TensorRT] ERROR: engine.cpp (179) - Cuda Error in ~ExecutionContext: 719 (unspecified launch failure)
[TensorRT] ERROR: INTERNAL_ERROR: std::exception
[TensorRT] ERROR: Parameter check failed at: …/rtSafe/safeContext.cpp::terminateCommonContext::155, condition: cudnnDestroy(context.cudnn) failure.
[TensorRT] ERROR: Parameter check failed at: …/rtSafe/safeContext.cpp::terminateCommonContext::165, condition: cudaEventDestroy(context.start) failure.
[TensorRT] ERROR: Parameter check failed at: …/rtSafe/safeContext.cpp::terminateCommonContext::170, condition: cudaEventDestroy(context.stop) failure.
[TensorRT] ERROR: …/rtSafe/safeRuntime.cpp (32) - Cuda Error in free: 719 (unspecified launch failure)
terminate called after throwing an instance of 'nvinfer1::CudaError'
what(): std::exception
Aborted (core dumped)
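
The Nano is a Maxwell board, so if I read the unified memory documentation right, the CPU may not touch a managed buffer at all while the GPU is active, unless the device reports concurrent managed access. A quick check of what the board claims to support (assuming pycuda exposes these CUDA device attributes under these names):

import pycuda.driver as cuda
import pycuda.autoinit  # creates the CUDA context

dev = cuda.Device(0)
# 1 = CPU and GPU may access managed memory concurrently,
# 0 = the CPU must wait until the GPU is idle before touching it
print("MANAGED_MEMORY:", dev.get_attribute(cuda.device_attribute.MANAGED_MEMORY))
print("CONCURRENT_MANAGED_ACCESS:", dev.get_attribute(cuda.device_attribute.CONCURRENT_MANAGED_ACCESS))

The full script: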

import os
import pdb

import numpy as np
import PIL.Image
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit as cudacontext  # initializes the CUDA context
import torch

device = torch.device("cuda")

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def get_engine(onnx_file_path, engine_file_path=None):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    if engine_file_path is None:
        engine_file_path = os.path.splitext(onnx_file_path)[0] + ".trt"
    if os.path.exists(engine_file_path):
        # If a serialized engine exists, use it instead of building an engine.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    # (the branch that builds the engine from the ONNX file is left out of this post)

def tenrt_run(engine, myinput):
    img = np.ascontiguousarray(myinput)

    print("create managed memory buffers. SIZE:", engine.get_binding_shape(0), engine.get_binding_shape(1))
    with engine.create_execution_context() as context:
        inmemory, outmemory, bindings, stream = unified_mem_mod(engine)
        print('Running inference on image ...')
        inmemory[:, :, :] = img
        for _ in range(30):
            do_inference_uni(context, bindings, stream)
        # the CPU reads outmemory after this, so wait for the GPU to finish first
        stream.synchronize()

    return outmemory

def do_inference_uni(context, bindings, stream):
    pdb.set_trace()  # debugging breakpoint
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    pdb.set_trace()  # debugging breakpoint

def unified_mem_mod(engine):
    stream = cuda.Stream()
    bindings = []
    for binding in engine:
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        inshape = tuple(engine.get_binding_shape(0))
        outshape = tuple(engine.get_binding_shape(1))
        print("unified memory shape. in:", inshape, "out:", outshape)
        inmemory = cuda.managed_empty(shape=inshape, dtype=dtype, mem_flags=cuda.mem_attach_flags.GLOBAL)
        outmemory = cuda.managed_empty(shape=outshape, dtype=dtype, mem_flags=cuda.mem_attach_flags.GLOBAL)
        cudacontext.context.synchronize()

        bindings.append(int(inmemory.base.get_device_pointer()))
        bindings.append(int(outmemory.base.get_device_pointer()))

    return inmemory, outmemory, bindings, stream

if __name__ == "__main__":
    image = PIL.Image.open('testimage.png').convert("RGB").resize((160, 160))
    image = (np.array(image) / 255).astype(np.float32).transpose(2, 0, 1)
    image = np.expand_dims(image, axis=0)

    onnx_file_path = "model.onnx"  # placeholder; the real path to the ONNX model
    myengine = get_engine(onnx_file_path)
    out = tenrt_run(myengine, image)
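
For comparison, the explicit-copy pattern I adapted this from (page-locked host buffers plus async memcpys, roughly following the TensorRT Python samples; the in_host/out_host/in_dev/out_dev names are mine) looks like this:

import numpy as np
import pycuda.driver as cuda

def explicit_copy_run(engine, context, img):
    """One synchronous inference with pinned buffers and explicit copies."""
    stream = cuda.Stream()
    in_host = cuda.pagelocked_empty(tuple(engine.get_binding_shape(0)), np.float32)
    out_host = cuda.pagelocked_empty(tuple(engine.get_binding_shape(1)), np.float32)
    in_dev = cuda.mem_alloc(in_host.nbytes)
    out_dev = cuda.mem_alloc(out_host.nbytes)
    bindings = [int(in_dev), int(out_dev)]

    in_host[...] = img                                 # copy into pinned host memory
    cuda.memcpy_htod_async(in_dev, in_host, stream)    # pinned host -> device
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(out_host, out_dev, stream)  # device -> pinned host
    stream.synchronize()                               # wait before reading out_host
    return out_host

The unified memory version is meant to skip the two memcpys, since the Nano's CPU and GPU share the same physical memory.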

I changed unified_mem_mod so the buffers are allocated (and the binding pointers appended) only once, rather than once per binding:

def unified_mem_mod(engine):
    stream = cuda.Stream()
    bindings = []
    dtypes = []
    for binding in engine:
        dtypes.append(trt.nptype(engine.get_binding_dtype(binding)))

    inshape = tuple(engine.get_binding_shape(0))
    outshape = tuple(engine.get_binding_shape(1))
    inmemory = cuda.managed_empty(shape=inshape, dtype=dtypes[0], mem_flags=cuda.mem_attach_flags.GLOBAL)
    outmemory = cuda.managed_empty(shape=outshape, dtype=dtypes[1], mem_flags=cuda.mem_attach_flags.GLOBAL)
    cudacontext.context.synchronize()

    bindings.append(int(inmemory.base.get_device_pointer()))
    bindings.append(int(outmemory.base.get_device_pointer()))

    return inmemory, outmemory, bindings, stream
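
Since the Nano apparently lacks concurrent managed access, I am also wondering whether the managed buffers should be attached to the inference stream, so that the CPU may legally touch them whenever that one stream is idle. A sketch of what I mean, assuming pycuda's ManagedAllocation exposes attach() as its docs describe (this is my reading of the API, not tested code; it reuses the trt/cuda imports from the script above):

def unified_mem_attached(engine, stream):
    """Allocate managed I/O buffers and attach them to a single stream.

    Attaching with mem_attach_flags.SINGLE is supposed to tell the driver
    these buffers are only used by `stream`, so CPU access is safe whenever
    that stream has drained, even on a device without concurrent managed
    access.
    """
    dtypes = [trt.nptype(engine.get_binding_dtype(b)) for b in engine]
    inshape = tuple(engine.get_binding_shape(0))
    outshape = tuple(engine.get_binding_shape(1))

    inmemory = cuda.managed_empty(shape=inshape, dtype=dtypes[0],
                                  mem_flags=cuda.mem_attach_flags.GLOBAL)
    outmemory = cuda.managed_empty(shape=outshape, dtype=dtypes[1],
                                   mem_flags=cuda.mem_attach_flags.GLOBAL)
    inmemory.base.attach(cuda.mem_attach_flags.SINGLE, stream)
    outmemory.base.attach(cuda.mem_attach_flags.SINGLE, stream)
    stream.synchronize()

    bindings = [int(inmemory.base.get_device_pointer()),
                int(outmemory.base.get_device_pointer())]
    return inmemory, outmemory, bindings

The call site would then pass the same stream that execute_async_v2 runs on, and synchronize that stream before reading outmemory back, as in tenrt_run above. Is that the right way to do this, or is something else going wrong?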