Python multi-process TensorRT inference resulting in `Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)`

Hi,

I’m trying to run TensorRT inference with Python multiprocessing. Whenever there is a call to context.execute_async, I get the error Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR), which is printed repeatedly. This is on a Xavier NX with the following versions:

python:   3.8.12
cuda:     11.4
cudnn:    8.6.0
TensorRT: 8.5.3.2
Jetpack:  5.1.3

I have looked in several places and incorporated the suggestions I found, such as defining a custom CUDA context for each process.
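
For example, the per-process context setup those suggestions describe boils down to this pattern (simplified sketch; the variable names here are mine):

import pycuda.driver as cuda

cuda.init()                           # initialize the CUDA driver in this process
device = cuda.Device(0)               # Xavier NX has a single GPU
cuda_context = device.make_context()  # dedicated context; created and made current
cuda_context.pop()                    # pop it; push/pop again around each GPU call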

The engine loads successfully, and the error doesn’t occur outside a multi-processing/multi-threading scenario (single-process inference works fine).

Although I can’t share the engine file or the actual code, my code skeleton looks like this:

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt


class TensorRTModel:
    def __init__(self, engine_path: str, batch_size: int = 1):
        cuda.init()
        self.engine_path = engine_path
        self.batch_size = batch_size
        trt_logger = trt.Logger(trt.Logger.INFO)

        # Load engine
        self.engine, self.context = self.load_engine(
            engine_path=engine_path,
            logger=trt_logger,
        )
        device = cuda.Device(0)
        self.cuda_context = device.make_context()
        self.stream = cuda.Stream()
        
        # inputs and outputs are lists of objects, each holding a host_memory
        # and a device_memory buffer (see the sketch after this class)
        self.inputs, self.outputs, self.bindings = self.allocate_buffers()

    def predict(self, x):
        np.copyto(self.inputs[0].host_memory, x.ravel())
        self.cuda_context.push()
        for inp in self.inputs:
            cuda.memcpy_htod_async(
                inp.device_memory,
                inp.host_memory,
                self.stream
            )
        # Run inference
        # This is where the error originates from
        self.context.execute_async(
            bindings=self.bindings,
            stream_handle=self.stream.handle
        )  # I also tried execute_async_v2; same error
        # Transfer prediction output from the GPU.
        for output in self.outputs:
            cuda.memcpy_dtoh_async(
                output.host_memory,
                output.device_memory,
                self.stream
            ) 
        # Synchronize the stream
        self.stream.synchronize()
        self.cuda_context.pop()
        
    def __del__(self):
        # Release this process's CUDA context on teardown
        self.cuda_context.detach()
        del self.cuda_context
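
For reference, load_engine and allocate_buffers follow the standard TensorRT/pycuda sample pattern. Simplified sketches of both methods (the real implementations differ slightly, and SimpleNamespace — from types import SimpleNamespace — stands in for my actual buffer class):

    @staticmethod
    def load_engine(engine_path, logger):
        # Deserialize the engine file and create an execution context
        with open(engine_path, 'rb') as f, trt.Runtime(logger) as runtime:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()
        return engine, context

    def allocate_buffers(self):
        # One page-locked host buffer and one device buffer per engine binding
        # (for implicit-batch engines, multiply size by max_batch_size)
        inputs, outputs, bindings = [], [], []
        for binding in self.engine:
            size = trt.volume(self.engine.get_binding_shape(binding))
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            host_memory = cuda.pagelocked_empty(size, dtype)
            device_memory = cuda.mem_alloc(host_memory.nbytes)
            bindings.append(int(device_memory))
            buf = SimpleNamespace(host_memory=host_memory, device_memory=device_memory)
            if self.engine.binding_is_input(binding):
                inputs.append(buf)
            else:
                outputs.append(buf)
        return inputs, outputs, bindings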

and the main file that drives the multi-process inference looks something like this:

import argparse
from time import time

import numpy as np

import multiprocessing
from tensorrt_model import TensorRTModel

def benchmark(
    model_path: str,
    tag: str,
    return_dict: dict,
    batch_size: int = 1,
    num_tests: int = 100,
    warm_up: int = 10
):
    model_fn = TensorRTModel(engine_path=model_path, batch_size=batch_size)
    
    inp = np.random.rand(1, 640, 640, 3).astype(np.float32)
    time_list = list()
    
    for _ in range(warm_up):
        model_fn.predict(inp)
    
    for _ in range(num_tests):
        t_start = time()
        model_fn.predict(inp)
        t_end = time()
        t_elapsed = (t_end - t_start)
        time_list.append(t_elapsed)
    del model_fn
    
    avg_time = np.mean(time_list)
    return_dict[tag] = avg_time


def main():
    try:
        multiprocessing.set_start_method('spawn', force=True)
        print("Multiprocessing start method: spawn")
    except RuntimeError:
        pass

    parser = argparse.ArgumentParser(
        prog='multi-process benchmarking for TensorRT models',
        description='This script benchmarks TensorRT models in a multi-process fashion'
    )
    parser.add_argument(
        '-n',
        '--num-process',
        dest='num_parallel_processes',
        type=int,
        default=1,
        help='Number of parallel processes to run benchmarking in'
    )
    parser.add_argument(
        '-m',
        '--model-path',
        dest='model_path',
        type=str,
        default='model.engine',
        help='Path to the model (TensorRT engine file)'
    )
    parser.add_argument(
        '-b',
        '--batch-size',
        dest='batch_size',
        type=int,
        default=1,
        help='batch size'
    )
    args = parser.parse_args()
    
    manager = multiprocessing.Manager()
    return_dict = manager.dict()
    
    processes = list()
    
    for idx in range(args.num_parallel_processes):
        p = multiprocessing.Process(
            target=benchmark,
            args=(args.model_path, f"process-{idx}", return_dict, args.batch_size),
        )
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
    
    print(return_dict)
    
if __name__ == '__main__':
    main()
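
I invoke the driver along these lines (the script name here is just illustrative):

python benchmark_main.py --num-process 2 --model-path model.engine --batch-size 1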

Any help in solving this error is appreciated. Thanks in advance…

Hi,

Could you share why you push the CUDA context after copying the input?
Could you try changing the order to see if it works?
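
That is, roughly:

    def predict(self, x):
        self.cuda_context.push()    # make this process's context current first
        np.copyto(self.inputs[0].host_memory, x.ravel())
        ...                         # H2D copies, inference, D2H copies as before
        self.stream.synchronize()
        self.cuda_context.pop()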

Thanks.

Thank you for responding, and sorry for the late reply…
I tried changing the order too; no luck, I’m getting the same error.