Multiple calls of enqueueV2

I created a network with multiple optimization profiles, so more than one execution context can be created.
The first pass through the loop below works fine, but on the second pass I get the following error:
[E] [TRT] 1: [resources.cpp::nvinfer1::ScopedCudaStream::ScopedCudaStream::447] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
The error occurs at the line: cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

This is the code:

    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(sample::gLogger);
    _engine = runtime->deserializeCudaEngine(data.data(), data.size(), nullptr);
    _context = _engine->createExecutionContext();

    for(int i=0; i<10; i++) {
        _context->setOptimizationProfile(i);
        _context->setBindingDimensions(2*i, nvinfer1::Dims4(4,224,224,3));
        _inputIndex = _engine->getBindingIndex("input_layer:0");
        _outputIndex = _engine->getBindingIndex("Identity:0");
        _input_height = _engine->getBindingDimensions(_inputIndex).d[1]; 
        _input_width = _engine->getBindingDimensions(_inputIndex).d[2];
        _input_channels = _engine->getBindingDimensions(_inputIndex).d[3];

        cudaError_t cudaErr = cudaStreamCreate(&_stream);
        int input_batch_size = 1;
        int inputSize = input_batch_size * _input_height * _input_width * _input_channels;
        int outputSize = input_batch_size * _engine->getBindingDimensions(_outputIndex).d[1];
        
        void* inputBuffer;
        cudaMalloc(&inputBuffer, inputSize*sizeof(float));
        void* outputBuffer;
        cudaMalloc(&outputBuffer, outputSize*sizeof(float));

        cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);

        int volChl = _input_channels * _input_width;
        int volImg = _input_channels * _input_height * _input_width;
        float* inputImg = (float*) malloc(inputSize*sizeof(float));

        // write dummyImg to inputImg

        cudaMemcpy(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice);

        float* outputPred = (float*) malloc(outputSize*sizeof(float));
        cudaMemcpy(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice);

        void* buffers[2];
        buffers[_inputIndex] = inputBuffer;
        buffers[_outputIndex] = outputBuffer;

        bool status = _context->enqueueV2(buffers, _stream, nullptr);

        cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

        free(outputPred);
        free(inputImg);

        cudaFree(outputBuffer);
        cudaFree(inputBuffer);
    }

My goal is to have multiple contexts that do parallel inference on one GPU.
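For reference, this is the pattern I think I eventually need: a rough sketch of the per-profile binding layout as I understand it, not tested code. `inputBuffer`, `outputBuffer` and `_stream` are assumed to be set up as above.

    // Sketch: with N optimization profiles the engine exposes
    // N * bindingsPerProfile bindings, and enqueueV2 expects a pointer array
    // covering all of them (entries for unused profiles stay nullptr).
    int nbBindings = _engine->getNbBindings();
    int bindingsPerProfile = nbBindings / _engine->getNbOptimizationProfiles();
    std::vector<void*> allBuffers(nbBindings, nullptr);  // needs <vector>

    int profile = 1;  // example: the second profile
    // getBindingIndex() returns the profile-0 index; shift it to this profile
    int inputIndex = _engine->getBindingIndex("input_layer:0") + profile * bindingsPerProfile;
    int outputIndex = _engine->getBindingIndex("Identity:0") + profile * bindingsPerProfile;

    _context->setOptimizationProfile(profile);
    _context->setBindingDimensions(inputIndex, nvinfer1::Dims4(1, 224, 224, 3));

    allBuffers[inputIndex] = inputBuffer;    // device buffers for this profile
    allBuffers[outputIndex] = outputBuffer;
    bool ok = _context->enqueueV2(allBuffers.data(), _stream, nullptr);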

Hi,
The links below might be useful for you; see also the short sketch after them:
https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#thread-safety
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-priorities
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
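For illustration, a minimal sketch of per-context streams with priorities, based on the stream-priorities link above (the context and buffer names are placeholders):

    // Illustrative only: each execution context should get its own stream;
    // priorities hint the scheduler to prefer pending work on the
    // higher-priority stream.
    int leastPriority = 0, greatestPriority = 0;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);

    cudaStream_t streamHigh, streamLow;
    cudaStreamCreateWithPriority(&streamHigh, cudaStreamNonBlocking, greatestPriority);
    cudaStreamCreateWithPriority(&streamLow, cudaStreamNonBlocking, leastPriority);

    // One context per stream, e.g. (contextA/contextB are placeholders):
    // contextA->enqueueV2(buffersA, streamHigh, nullptr);
    // contextB->enqueueV2(buffersB, streamLow, nullptr);

    cudaStreamDestroy(streamHigh);
    cudaStreamDestroy(streamLow);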
For multi-threading/streaming, we would suggest using DeepStream or Triton.
For more details, we recommend raising the query on the DeepStream or Triton forum.

Thanks!

In the code, CUDA memory is allocated, an inference is run, and the CUDA memory is deallocated. But the second time it does not work, so I think it is a CUDA memory problem caused by the call to enqueueV2(…)
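As a side note, a generic error-check macro (a standard CUDA pattern, not part of my original code) can help pinpoint where the failure actually surfaces, since an illegal access triggered by an asynchronous call like enqueueV2 is often only reported by the next synchronizing call such as cudaMemcpy:

    // Generic CUDA error check (illustrative); reports where an error surfaces.
    #define CUDA_CHECK(call)                                              \
        do {                                                              \
            cudaError_t err_ = (call);                                    \
            if (err_ != cudaSuccess)                                      \
                fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                        cudaGetErrorString(err_), __FILE__, __LINE__);    \
        } while (0)

    // Synchronize the stream before the device-to-host copy, then check both:
    // CUDA_CHECK(cudaStreamSynchronize(_stream));
    // CUDA_CHECK(cudaMemcpy(outputPred, outputBuffer,
    //                       outputSize*sizeof(float), cudaMemcpyDeviceToHost));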

I reduced the code to the following:
    IExecutionContext *context = engine->createExecutionContext();
    if (!context)
        return 0;
    context->setBindingDimensions(0, Dims4(2,224,224,3));

    int inputIndex = engine->getBindingIndex("input_layer:0");
    int outputIndex = engine->getBindingIndex("Identity:0");

    int input_batch_size = 1;
    int input_height = engine->getBindingDimensions(inputIndex).d[1];
    int input_width = engine->getBindingDimensions(inputIndex).d[2];
    int input_channels = engine->getBindingDimensions(inputIndex).d[3];
    int inputSize = input_batch_size * input_height * input_width * input_channels;
    int outputSize = input_batch_size * engine->getBindingDimensions(outputIndex).d[1];

    void* inputBuffer;
    if(cudaMalloc(&inputBuffer, inputSize*sizeof(float)) != 0)
        return 0;
    void* outputBuffer;
    if(cudaMalloc(&outputBuffer, outputSize * sizeof(float)) != 0)
        return 0;

    cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);

    int volChl = input_channels * input_width;
    int volImg = input_channels * input_height * input_width;

    float* inputImg = (float*) malloc(inputSize*sizeof(float));
    // write dummyImg to inputImg
    cudaMemcpy(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice);

    float* outputPred = (float*) malloc(outputSize*sizeof(float));
    cudaMemcpy(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice);

    void* buffers[2];
    buffers[inputIndex] = inputBuffer;
    buffers[outputIndex] = outputBuffer;

    cudaStream_t stream;
    cudaError_t cudaErr = cudaStreamCreate(&stream);
    if(cudaErr != 0)
        return 0;

    bool status = context->enqueueV2(buffers, stream, nullptr);
    if (!status)
        return 0;

    cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

    cudaStreamDestroy(stream);
    cudaFree(outputBuffer);
    cudaFree(inputBuffer);
    free(outputPred);
    free(inputImg);

    IExecutionContext *context1 = engine->createExecutionContext();
    if (!context1)
        return 0;
    context1->setOptimizationProfile(1);
    context1->setBindingDimensions(2, Dims4(2,224,224,3));

    void* inputBuffer1;
    if(cudaMalloc(&inputBuffer1, inputSize*sizeof(float)) != 0)
        return 0;
    void* outputBuffer1;
    if(cudaMalloc(&outputBuffer1, outputSize * sizeof(float)) != 0)
        return 0;

    float* inputImg1 = (float*) malloc(inputSize*sizeof(float));
    // write dummyImg to inputImg1
    cudaMemcpy(inputBuffer1, inputImg1, inputSize*sizeof(float), cudaMemcpyHostToDevice);

    float* outputPred1 = (float*) malloc(outputSize*sizeof(float));
    cudaMemcpy(outputBuffer1, outputPred1, outputSize*sizeof(float), cudaMemcpyHostToDevice);

    void* buffers1[2];
    buffers1[inputIndex] = inputBuffer1;
    buffers1[outputIndex] = outputBuffer1;

    cudaStream_t stream1;
    cudaErr = cudaStreamCreate(&stream1);
    if(cudaErr != 0)
        return 0;

    status = context->enqueueV2(buffers1, stream, nullptr);
    if (!status)
        return 0;

    cudaMemcpy(outputPred1, outputBuffer1, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

    cudaStreamDestroy(stream1);
    cudaFree(outputBuffer1);
    cudaFree(inputBuffer1);
    free(outputPred1);
    free(inputImg1);

And I received the following output/error:
[07/15/2021-10:40:57] [I] [TRT] [MemUsageChange] Init CUDA: CPU +433, GPU +0, now: CPU 19762, GPU 4257 (MiB)
[07/15/2021-10:40:57] [I] [TRT] Loaded engine size: 1604 MB
[07/15/2021-10:40:57] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine begin: CPU 19762 MiB, GPU 4257 MiB
[07/15/2021-10:41:00] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:00] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +445, GPU +166, now: CPU 20212, GPU 6024 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +214, GPU +170, now: CPU 20426, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 20426, GPU 6176 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine end: CPU 20426 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 18821 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 18821, GPU 6186 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 18821, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 18855 MiB, GPU 6369 MiB
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 19287 MiB, GPU 6529 MiB
[07/15/2021-10:41:02] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 19287, GPU 6537 (MiB)
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 19287, GPU 6547 (MiB)
[07/15/2021-10:41:02] [I] [TRT] Could not set default profile 0 for execution context. Profile index must be set explicitly.
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 19287 MiB, GPU 6722 MiB
[07/15/2021-10:41:02] [E] [TRT] 1: [hardwareContext.cpp::nvinfer1::rt::CommonContext::configure::92] Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)

Could you let us know the following environment details? Also, please share a complete script that reproduces the issue.

TensorRT Version :
GPU Type:
Nvidia Driver Version :
CUDA Version :
CUDNN Version :
Operating System + Version :
Python Version (if applicable) :
TensorFlow Version (if applicable) :
PyTorch Version (if applicable) :
Baremetal or Container (if container which image + tag) :
Container:

TensorRT Version : 8.0.1.6
GPU Type: NVIDIA TITAN RTX
Nvidia Driver Version : 27.21.14.6589
CUDA Version : 11.3
CUDNN Version : 8.2.1
Operating System + Version : Windows 10
Python Version (if applicable) : 3.7.7
TensorFlow Version (if applicable) : -
PyTorch Version (if applicable) : -
Baremetal or Container (if container which image + tag) : -
Container: -

@OpDaSo_B,

Your NVIDIA driver version looks unusual. Could you please share the nvidia-smi output? We also recommend making sure the CUDA toolkit is installed correctly.

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.89       Driver Version: 465.89       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA TITAN RTX    WDDM | 00000000:09:00.0 Off |                  N/A |
| 40%   42C    P8    22W / 280W |    774MiB / 24576MiB |      2%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

Hi @OpDaSo_B,

Based on the code shared above, it looks like you're calling the second enqueueV2 on the same context and on the CUDA stream that has already been destroyed. Did you mean to call the second enqueueV2 with the context1 and stream1 you created?

I think this is what is causing the problem.
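Presumably the last part of your snippet should read something like this (a sketch of the implied fix):

    // Use the second context with the still-live second stream, and wait for
    // the inference to finish before copying back and destroying the stream.
    status = context1->enqueueV2(buffers1, stream1, nullptr);
    if (!status)
        return 0;

    cudaStreamSynchronize(stream1);
    cudaMemcpy(outputPred1, outputBuffer1, outputSize*sizeof(float), cudaMemcpyDeviceToHost);
    cudaStreamDestroy(stream1);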