Multiple calls of enqueueV2

I created a network with multiple optimization profiles, so more than one execution context can be created from it.
When I run the following code, the first iteration works fine, but on the second iteration I get this error:
[E] [TRT] 1: [resources.cpp::nvinfer1::ScopedCudaStream::ScopedCudaStream::447] Error Code 1: Cuda Runtime (an illegal memory access was encountered)
The error occurs at the line: cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

This is the code:

    // Deserializes an engine, creates one execution context and then, for each of
    // ten optimization profiles, allocates buffers, runs one inference and frees
    // the buffers again.
    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(sample::gLogger);
    // NOTE(review): the results are stored in `engine` / `context`, but everything
    // below goes through `_engine` / `_context` — presumably the same objects; confirm.
    engine = runtime->deserializeCudaEngine(data.data(), data.size(), nullptr);
    context = _engine->createExecutionContext();

    // The same `_context` is switched to profile i on every iteration.
    // NOTE(review): the bound of 10 assumes the engine was built with at least ten
    // profiles — not visible here.
    for(int i=0; i<10; i++) {
        _context->setOptimizationProfile(i);
        // Binding 2*i is given batch size 4, while the buffers below are sized with
        // input_batch_size = 1.
        // NOTE(review): if binding 2*i is the input of profile i, the device
        // buffers are too small for the shape requested here — confirm.
        _context->setBindingDimensions(2*i, nvinfer1::Dims4(4,224,224,3));
        // Looked up by name only, so these two indices do not depend on i.
        _inputIndex = _engine->getBindingIndex("input_layer:0");
        _outputIndex = _engine->getBindingIndex("Identity:0");
        _input_height = _engine->getBindingDimensions(_inputIndex).d[1]; 
        _input_width = _engine->getBindingDimensions(_inputIndex).d[2];
        _input_channels = _engine->getBindingDimensions(_inputIndex).d[3];

        // A stream is created on every iteration; no cudaStreamDestroy appears in
        // this snippet, and cudaErr is never examined.
        cudaError_t cudaErr = cudaStreamCreate(&_stream);
        int input_batch_size = 1;
        int inputSize = input_batch_size * _input_height * _input_width * _input_channels;
        int outputSize = input_batch_size * _engine->getBindingDimensions(_outputIndex).d[1];
        
        // Return values of the allocations are not checked.
        void* inputBuffer;
        cudaMalloc(&inputBuffer, inputSize*sizeof(float));
        void* outputBuffer;
        cudaMalloc(&outputBuffer, outputSize*sizeof(float)) ;

        cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);

        // dummyImg, volChl and volImg are not used in the visible code; presumably
        // they feed the elided copy into inputImg below.
        int volChl = _input_channels * _input_width;
        int volImg = _input_channels * _input_height * _input_width;
        float* inputImg = (float*) malloc(inputSize*sizeof(float));

        // write dummyImg  to inputImg

        cudaMemcpy(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice);

        // outputPred is uploaded straight after malloc, i.e. with whatever the
        // allocator left in it.
        float* outputPred = (float*) malloc(outputSize*sizeof(float));
        cudaMemcpy(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice);

        // The bindings array has two slots, filled at the name-based indices.
        // NOTE(review): TensorRT documents a separate range of binding indices for
        // each profile (the 2*i above already relies on that). If so, for i >= 1
        // enqueueV2 reads entries past the end of this array, which would explain
        // a failure that starts on the second iteration — confirm against
        // _engine->getNbBindings().
        void* buffers[2];
        buffers[_inputIndex] = inputBuffer;
        buffers[_outputIndex] = outputBuffer;

        // `status` is not checked.
        bool status = _context->enqueueV2(buffers, _stream, nullptr);

        // This is the call at which the post reports the illegal memory access.
        cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

        free(outputPred);
        free(inputImg);

        cudaFree(outputBuffer);
        cudaFree(inputBuffer);
  }

My goal is to have multiple contexts that do parallel inference on one GPU.

Hi,
The links below might be useful to you:
https://docs.nvidia.com/deeplearning/tensorrt/best-practices/index.html#thread-safety
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#stream-priorities
https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
For multi-threading/multi-streaming, we suggest using DeepStream or Triton.
For more details, we recommend raising the query on the DeepStream or Triton forum.

Thanks!

The code allocates CUDA memory, runs one inference and then frees the CUDA memory again, but the second time around it does not work. I therefore suspect a CUDA memory problem triggered by the call to enqueueV2(…).

I reduced the code to the following:
// Reduced repro: runs one inference on `context` (no profile selected
// explicitly), releases its buffers and stream, then sets up a second context
// `context1` on optimization profile 1 and enqueues a second inference.
IExecutionContext *context = engine->createExecutionContext();
if (!context)
return 0;
// Binding 0 is given batch size 2, while the buffers below are sized with
// input_batch_size = 1.
// NOTE(review): if binding 0 is the input, the device buffers are too small for
// the shape requested here — confirm.
context->setBindingDimensions(0, Dims4(2,224,224,3));

// Name-based lookups; the same two indices are reused for both contexts below.
int inputIndex = engine->getBindingIndex("input_layer:0");
int outputIndex = engine->getBindingIndex("Identity:0");

int input_batch_size = 1; 
int input_height = engine->getBindingDimensions(inputIndex).d[1]; 
int input_width = engine->getBindingDimensions(inputIndex).d[2]; 
int input_channels = engine->getBindingDimensions(inputIndex).d[3]; 
int inputSize = input_batch_size * input_height * input_width * input_channels;
int outputSize = input_batch_size * engine->getBindingDimensions(outputIndex).d[1];

void* inputBuffer;
if(cudaMalloc(&inputBuffer, inputSize*sizeof(float)) != 0)
    return 0;
void* outputBuffer;
if(cudaMalloc(&outputBuffer, outputSize * sizeof(float)) != 0)
    return 0;

cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);

// volChl and volImg are not used in the visible code; presumably they feed the
// elided copy into inputImg below.
int volChl = input_channels * input_width;
int volImg = input_channels * input_height * input_width;

float* inputImg = (float*) malloc(inputSize*sizeof(float));
//write dummyImg to input Img
cudaMemcpy(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice);

// outputPred is uploaded straight after malloc, i.e. with whatever the allocator
// left in it.
float* outputPred = (float*) malloc(outputSize*sizeof(float));
cudaMemcpy(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice);

void* buffers[2];
buffers[inputIndex] = inputBuffer;
buffers[outputIndex] = outputBuffer;

cudaStream_t stream;
cudaError_t cudaErr = cudaStreamCreate(&stream);
if(cudaErr != 0)
    return 0;

bool status = context->enqueueV2(buffers, stream, nullptr);
if (!status)
    return 0;

cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

// `stream` is destroyed here, yet it is passed to the second enqueueV2 below.
cudaStreamDestroy(stream);
cudaFree(outputBuffer);
cudaFree(inputBuffer);
free(outputPred);
free(inputImg);

IExecutionContext *context1 = engine->createExecutionContext();
// NOTE(review): this tests `context` again; `context1` is the pointer that was
// just created and is the one dereferenced next.
if (!context)
    return 0;
// Second context: profile 1, with the input shape set on binding 2.
context1->setOptimizationProfile(1);
context1->setBindingDimensions(2, Dims4(2,224,224,3));

void* inputBuffer1;
if(cudaMalloc(&inputBuffer1, inputSize*sizeof(float)) != 0)
    return 0;
void* outputBuffer1;
if(cudaMalloc(&outputBuffer1, outputSize * sizeof(float)) != 0)
    return 0;

float* inputImg1 = (float*) malloc(inputSize*sizeof(float));
// write dummyImg to inputImg1
cudaMemcpy(inputBuffer1, inputImg1, inputSize*sizeof(float), cudaMemcpyHostToDevice);

float* outputPred1 = (float*) malloc(outputSize*sizeof(float));
cudaMemcpy(outputBuffer1, outputPred1, outputSize*sizeof(float), cudaMemcpyHostToDevice);

// buffers1 has two slots, filled at the same name-based indices as `buffers`.
// NOTE(review): context1 was given binding 2 for its input shape; TensorRT
// documents a separate range of binding indices for each profile, so profile 1
// would look at slots 2 and 3, which this array does not have — confirm against
// engine->getNbBindings().
void* buffers1[2];
buffers1[inputIndex] = inputBuffer1;
buffers1[outputIndex] = outputBuffer1;

// stream1 is created here but is not the stream passed to enqueueV2 below.
cudaStream_t stream1;
cudaErr = cudaStreamCreate(&stream1);
if(cudaErr != 0)
    return 0;

// Enqueues on `context` and on `stream` (destroyed above), not on `context1`
// and `stream1`, although buffers1 is passed.
status = context->enqueueV2(buffers1, stream, nullptr);
if (!status)
    return 0;

cudaMemcpy(outputPred1, outputBuffer1, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

cudaStreamDestroy(stream1);
cudaFree(outputBuffer1);
cudaFree(inputBuffer1);
free(outputPred1);
free(inputImg1);

And I received the following output/error:
[07/15/2021-10:40:57] [I] [TRT] [MemUsageChange] Init CUDA: CPU +433, GPU +0, now: CPU 19762, GPU 4257 (MiB)
[07/15/2021-10:40:57] [I] [TRT] Loaded engine size: 1604 MB
[07/15/2021-10:40:57] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine begin: CPU 19762 MiB, GPU 4257 MiB
[07/15/2021-10:41:00] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:00] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +445, GPU +166, now: CPU 20212, GPU 6024 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +214, GPU +170, now: CPU 20426, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 20426, GPU 6176 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine end: CPU 20426 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 18821 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 18821, GPU 6186 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 18821, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 18855 MiB, GPU 6369 MiB
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 19287 MiB, GPU 6529 MiB
[07/15/2021-10:41:02] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 19287, GPU 6537 (MiB)
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 19287, GPU 6547 (MiB)
[07/15/2021-10:41:02] [I] [TRT] Could not set default profile 0 for execution context. Profile index must be set explicitly.
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 19287 MiB, GPU 6722 MiB
[07/15/2021-10:41:02] [E] [TRT] 1: [hardwareContext.cpp::nvinfer1::rt::CommonContext::configure::92] Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)

Could you let us know the following environment details? Please also share the complete script that reproduces the issue.

TensorRT Version :
GPU Type:
Nvidia Driver Version :
CUDA Version :
CUDNN Version :
Operating System + Version :
Python Version (if applicable) :
TensorFlow Version (if applicable) :
PyTorch Version (if applicable) :
Baremetal or Container (if container which image + tag) :
Container:

TensorRT Version : 8.0.1.6
GPU Type: NVIDIA TITAN RTX
Nvidia Driver Version : 27.21.14.6589
CUDA Version : 11.3
CUDNN Version : 8.2.1
Operating System + Version : Windows 10
Python Version (if applicable) : 3.7.7
TensorFlow Version (if applicable) : -
PyTorch Version (if applicable) : -
Baremetal or Container (if container which image + tag) : -
Container: -

@OpDaSo_B,

Your NVIDIA driver version looks unusual. Could you please share the nvidia-smi output? We also recommend making sure that the CUDA toolkit is installed correctly.

+--------------------------------------------------------------------------------+
| NVIDIA-SMI 465.89 Driver Version: 465.89 CUDA Version: 11.3 |
|-------------------------------+-------------------------+----------------------+
| GPU Name TCC/WDDM | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+============|
| 0 NVIDIA TITAN RTX WDDM | 00000000:09:00.0 Off | N/A |
| 40% 42C P8 22W / 280W | 774MiB / 24576MiB | 2% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+

Hi @OpDaSo_B,

Based on the code shared above, it looks like you’re calling the second enqueueV2 on the same context and on the same, already destroyed, CUDA stream. Did you mean to call the second enqueueV2 on context1 with the newly created stream1?

I think this is what is causing the problem.

Yes, there is a mistake in the above code.
I created a TensorRT engine with an input size of [-1, 224, 224, 3] and added more profiles during the creation of the engine.

Here, is the correct code:

IExecutionContext* context = engine->createExecutionContext();
if (!context)
    return 0;

cudaStream_t stream;
cudaError_t cudaErr = cudaStreamCreate(&stream);
if(cudaErr != 0)
    return 0;

cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);

int inputIndex = engine->getBindingIndex("input_layer:0");
int outputIndex = engine->getBindingIndex("Identity:0");
int input_batch_size = batchMat.size();
int input_height = engine->getBindingDimensions(inputIndex).d[1];
int input_width = engine->getBindingDimensions(inputIndex).d[2];
int input_channels = engine->getBindingDimensions(inputIndex).d[3];
int inputSize = input_batch_size * input_height * input_width * input_channels;
int outputSize = input_batch_size * engine->getBindingDimensions(outputIndex).d[1];

void* inputBuffer;
if(cudaMalloc(&inputBuffer, inputSize*sizeof(float)) != 0)
    return 0;
void* outputBuffer;
if(cudaMalloc(&outputBuffer, outputSize * sizeof(float)) != 0)
    return 0;

int volChl = input_channels * input_width;
int volImg = input_channels * input_height * input_width;

float* inputImg = (float*) malloc(inputSize*sizeof(float));

//write dummyImg to inputImg

cudaMemcpyAsync(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice, stream);

float* outputPred = (float*) malloc(outputSize*sizeof(float));

cudaMemcpyAsync(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice, stream);

void* buffers[2];
buffers[inputIndex] = inputBuffer;
buffers[outputIndex] = outputBuffer;

context->setOptimizationProfile(0);
context->setBindingDimensions(0, Dims4(input_batch_size,224,224,3));

bool status = context->enqueueV2(buffers, stream, nullptr);
if (!status)
    return 0;

cudaMemcpyAsync(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost, stream);

cudaStreamSynchronize(stream);

cudaFree(outputBuffer);
cudaFree(inputBuffer);
free(outputPred);
free(inputImg);
cudaStreamDestroy(stream);

IExecutionContext *context1 = engine->createExecutionContext();
if (!context1)
    return 0;

cudaStream_t stream1;
cudaError_t cudaError = cudaStreamCreate(&stream1); 
if(cudaError != 0)
    return 0;
context1->setOptimizationProfile(1);
context1->setBindingDimensions(2, Dims4(input_batch_size,224,224,3));

void* inputBuffer1;
if(cudaMalloc(&inputBuffer1, inputSize*sizeof(float)) != 0)
    return 0;
void* outputBuffer1;
if(cudaMalloc(&outputBuffer1, outputSize * sizeof(float)) != 0)
    return 0;

float* inputImg1 = (float*) malloc(inputSize*sizeof(float));
//write dummyImg to inputImg1
cudaMemcpy(inputBuffer1, inputImg1, inputSize*sizeof(float), cudaMemcpyHostToDevice);

float* outputPred1 = (float*) malloc(outputSize*sizeof(float));
cudaMemcpy(outputBuffer1, outputPred1, outputSize*sizeof(float), cudaMemcpyHostToDevice);

void* buffers1[2];
buffers1[inputIndex] = inputBuffer1;
buffers1[outputIndex] = outputBuffer1;

status = context1->enqueueV2(buffers1, stream1, nullptr);
if (!status)
    return 0;

cudaMemcpy(outputPred1, outputBuffer1, outputSize*sizeof(float), cudaMemcpyDeviceToHost);

cudaStreamDestroy(stream1);
cudaFree(outputBuffer1);
cudaFree(inputBuffer1);
free(outputPred1);
free(inputImg1);

The code runs, but the results of context1->enqueueV2(…) in the variable outputPred1 are wrong: every value in outputPred1 equals -4.31602e+08. I therefore think that the call to context1->enqueueV2(…) is not doing the right thing…

@OpDaSo_B,

Thanks for confirming that the original issue is resolved. Could you please share more details on the second issue you mentioned, and provide us with a minimal script/model that reproduces it?

Here is the log output:

[07/29/2021-12:36:07] [I] [TRT] [MemUsageChange] Init CUDA: CPU +266, GPU +0, now: CPU 13735, GPU 4257 (MiB)
[07/29/2021-12:36:07] [I] [TRT] Loaded engine size: 1604 MB
[07/29/2021-12:36:07] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine begin: CPU 13735 MiB, GPU 4257 MiB
[07/29/2021-12:36:11] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/29/2021-12:36:11] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +461, GPU +166, now: CPU 14169, GPU 6024 (MiB)
[07/29/2021-12:36:11] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +210, GPU +170, now: CPU 14379, GPU 6194 (MiB)
[07/29/2021-12:36:11] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 14379, GPU 6176 (MiB)
[07/29/2021-12:36:11] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine end: CPU 14379 MiB, GPU 6176 MiB
[07/29/2021-12:36:11] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 12772 MiB, GPU 6176 MiB
[07/29/2021-12:36:11] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/29/2021-12:36:11] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 12772, GPU 6186 (MiB)
[07/29/2021-12:36:11] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 12772, GPU 6194 (MiB)
[07/29/2021-12:36:11] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 12806 MiB, GPU 6369 MiB
output of outputPred:
1.82646e-06
9.66485e-05
0.999902
0.00256934
0.996522
0.000908257
[07/29/2021-12:36:12] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 13237 MiB, GPU 6529 MiB
[07/29/2021-12:36:12] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/29/2021-12:36:12] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 13237, GPU 6537 (MiB)
[07/29/2021-12:36:12] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 13237, GPU 6547 (MiB)
[07/29/2021-12:36:12] [I] [TRT] Could not set default profile 0 for execution context. Profile index must be set explicitly.
[07/29/2021-12:36:12] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 13237 MiB, GPU 6722 MiB
output of outputPred1:
-4.31602e+08
-4.31602e+08
-4.31602e+08
-4.31602e+08
-4.31602e+08
-4.31602e+08

@OpDaSo_B,

As mentioned in my previous reply, could you share the model and the complete script that reproduce the issue, so that we can try it on our end and debug it better?

I wrote you a PM.

Sorry for the delayed response. Have you tried running the model with ONNX Runtime? Please let us know if you face the same issue there.

You can also use Polygraphy to debug this (see the Polygraphy 0.31.1 documentation).

Thank you.

Hi,
I do not run my model on onnx-runtime.
Did you test my model on TensorRT?

Hi,

Are you still facing this issue?