In the code, CUDA memory is allocated, an inference is run, and the CUDA memory is deallocated. But the second time it does not work. Therefore I think it is a CUDA memory problem caused by the call to enqueueV2(…).
I reduced the code to the following:
IExecutionContext *context = engine->createExecutionContext();
if (!context)
return 0;
context->setBindingDimensions(0, Dims4(2,224,224,3));
int inputIndex = engine->getBindingIndex("input_layer:0");
int outputIndex = engine->getBindingIndex("Identity:0");
int input_batch_size = 1;
int input_height = engine->getBindingDimensions(inputIndex).d[1];
int input_width = engine->getBindingDimensions(inputIndex).d[2];
int input_channels = engine->getBindingDimensions(inputIndex).d[3];
int inputSize = input_batch_size * input_height * input_width * input_channels;
int outputSize = input_batch_size * engine->getBindingDimensions(outputIndex).d[1];
void* inputBuffer;
if(cudaMalloc(&inputBuffer, inputSize*sizeof(float)) != 0)
return 0;
void* outputBuffer;
if(cudaMalloc(&outputBuffer, outputSize * sizeof(float)) != 0)
return 0;
cv::Mat dummyImg = cv::Mat::ones(224, 224, CV_8UC3);
int volChl = input_channels * input_width;
int volImg = input_channels * input_height * input_width;
float* inputImg = (float*) malloc(inputSize*sizeof(float));
//write dummyImg to input Img
cudaMemcpy(inputBuffer, inputImg, inputSize*sizeof(float), cudaMemcpyHostToDevice);
float* outputPred = (float*) malloc(outputSize*sizeof(float));
cudaMemcpy(outputBuffer, outputPred, outputSize*sizeof(float), cudaMemcpyHostToDevice);
void* buffers[2];
buffers[inputIndex] = inputBuffer;
buffers[outputIndex] = outputBuffer;
cudaStream_t stream;
cudaError_t cudaErr = cudaStreamCreate(&stream);
if(cudaErr != 0)
return 0;
bool status = context->enqueueV2(buffers, stream, nullptr);
if (!status)
return 0;
cudaMemcpy(outputPred, outputBuffer, outputSize*sizeof(float), cudaMemcpyDeviceToHost);
cudaStreamDestroy(stream);
cudaFree(outputBuffer);
cudaFree(inputBuffer);
free(outputPred);
free(inputImg);
IExecutionContext *context1 = engine->createExecutionContext();
if (!context)
return 0;
context1->setOptimizationProfile(1);
context1->setBindingDimensions(2, Dims4(2,224,224,3));
void* inputBuffer1;
if(cudaMalloc(&inputBuffer1, inputSize*sizeof(float)) != 0)
return 0;
void* outputBuffer1;
if(cudaMalloc(&outputBuffer1, outputSize * sizeof(float)) != 0)
return 0;
float* inputImg1 = (float*) malloc(inputSize*sizeof(float));
// write dummyImg to inputImg1
cudaMemcpy(inputBuffer1, inputImg1, inputSize*sizeof(float), cudaMemcpyHostToDevice);
float* outputPred1 = (float*) malloc(outputSize*sizeof(float));
cudaMemcpy(outputBuffer1, outputPred1, outputSize*sizeof(float), cudaMemcpyHostToDevice);
void* buffers1[2];
buffers1[inputIndex] = inputBuffer1;
buffers1[outputIndex] = outputBuffer1;
cudaStream_t stream1;
cudaErr = cudaStreamCreate(&stream1);
if(cudaErr != 0)
return 0;
status = context->enqueueV2(buffers1, stream, nullptr);
if (!status)
return 0;
cudaMemcpy(outputPred1, outputBuffer1, outputSize*sizeof(float), cudaMemcpyDeviceToHost);
cudaStreamDestroy(stream1);
cudaFree(outputBuffer1);
cudaFree(inputBuffer1);
free(outputPred1);
free(inputImg1);
And I received the following output/error:
[07/15/2021-10:40:57] [I] [TRT] [MemUsageChange] Init CUDA: CPU +433, GPU +0, now: CPU 19762, GPU 4257 (MiB)
[07/15/2021-10:40:57] [I] [TRT] Loaded engine size: 1604 MB
[07/15/2021-10:40:57] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine begin: CPU 19762 MiB, GPU 4257 MiB
[07/15/2021-10:41:00] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:00] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +445, GPU +166, now: CPU 20212, GPU 6024 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +214, GPU +170, now: CPU 20426, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +0, now: CPU 20426, GPU 6176 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] deserializeCudaEngine end: CPU 20426 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 18821 MiB, GPU 6176 MiB
[07/15/2021-10:41:01] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +10, now: CPU 18821, GPU 6186 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 18821, GPU 6194 (MiB)
[07/15/2021-10:41:01] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 18855 MiB, GPU 6369 MiB
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation begin: CPU 19287 MiB, GPU 6529 MiB
[07/15/2021-10:41:02] [W] [TRT] TensorRT was linked against cuBLAS/cuBLAS LT 11.5.1 but loaded cuBLAS/cuBLAS LT 11.4.2
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 19287, GPU 6537 (MiB)
[07/15/2021-10:41:02] [I] [TRT] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 19287, GPU 6547 (MiB)
[07/15/2021-10:41:02] [I] [TRT] Could not set default profile 0 for execution context. Profile index must be set explicitly.
[07/15/2021-10:41:02] [I] [TRT] [MemUsageSnapshot] ExecutionContext creation end: CPU 19287 MiB, GPU 6722 MiB
[07/15/2021-10:41:02] [E] [TRT] 1: [hardwareContext.cpp::nvinfer1::rt::CommonContext::configure::92] Error Code 1: Cudnn (CUDNN_STATUS_MAPPING_ERROR)