Inference time increases rapidly with a high-resolution input image


I wrote inference code using TensorRT for a super-resolution task with the Real-ESRGAN model, which is a fully convolutional network.
My workflow pipeline is:
1. export an ONNX model from PyTorch
2. add post-processing to the ONNX model
3. build a TensorRT engine from the ONNX model
4. write the inference code with TensorRT
When I run the inference code with images of different resolutions one by one, I find that the inference time increases rapidly as the resolution grows, and in my experience it is too slow. For example, I measured about 60 ms for a 100×100 input image and about 360 ms for a 400×400 input image.


TensorRT Version: --fix-missing
GPU Type: 2070 Super
Nvidia Driver Version: 495.29.05
CUDA Version: 11.5.2
CUDNN Version:
Operating System + Version: ubuntu 20.04
Python Version (if applicable): 3.9
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 2.0.0
Baremetal or Container (if container which image + tag):

Relevant Files

Please attach or include links to any models, data, files, or scripts necessary to reproduce your issue. (Github repo, Google Drive, Dropbox, etc.)

Steps To Reproduce

1. export an ONNX model from PyTorch
2. add post-processing to the ONNX model
3. build a TensorRT engine from the ONNX model (a rough sketch of this step is shown below)
4. write the inference code with TensorRT
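
For step 3, a minimal sketch of how the engine could be built in C++ with dynamic input height/width. The tensor name "input", the file paths, and the 100×100 to 400×400 profile range are assumptions for illustration, not taken from my actual script:

#include <cstdio>
#include <fstream>
#include <memory>
#include <NvInfer.h>
#include <NvOnnxParser.h>

class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING) printf("%s\n", msg);
    }
};

bool build_engine(const char* onnx_path, const char* engine_path) {
    Logger logger;
    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(logger));
    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(
        1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, logger));
    if (!parser->parseFromFile(onnx_path, static_cast<int>(nvinfer1::ILogger::Severity::kWARNING)))
        return false;

    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    // One optimization profile covering the resolutions actually used at runtime.
    // kOPT should match the most common input size; shapes far from kOPT may run
    // with kernels tuned for a different resolution.
    auto profile = builder->createOptimizationProfile();
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 100, 100});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{1, 3, 400, 400});
    profile->setDimensions("input", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{1, 3, 400, 400});
    config->addOptimizationProfile(profile);

    auto serialized = std::unique_ptr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
    if (!serialized) return false;
    std::ofstream out(engine_path, std::ios::binary);
    out.write(static_cast<const char*>(serialized->data()), serialized->size());
    return true;
}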
A fragment of the TensorRT inference code (step 4) is below:

void inference(Job& job){
            auto ppre_start_time = std::chrono::high_resolution_clock::now();
            cv::Mat image = cv::imdecode(job.input_image, cv::IMREAD_COLOR);
            int input_channel = image.channels();   
            int input_height = image.rows;
            int input_width = image.cols;
            int input_numel = input_batch * input_channel * input_height * input_width;
            float* input_data_host = nullptr;
            float* input_data_device = nullptr;
            checkRuntime(cudaMallocHost(&input_data_host, input_numel * sizeof(float)));
            checkRuntime(cudaMalloc(&input_data_device, input_numel * sizeof(float)));
            auto ppre_end_time = std::chrono::high_resolution_clock::now();
            auto ppre_duration = std::chrono::duration_cast<std::chrono::milliseconds>(ppre_end_time - ppre_start_time);
            printf("前前处理执行时间: %ld ms\n", ppre_duration.count());

            auto pre_start_time = std::chrono::high_resolution_clock::now();
            int image_area = image.cols * image.rows;
            unsigned char* pimage = image.data;
            float* phost_b = input_data_host + image_area * 0;
            float* phost_g = input_data_host + image_area * 1;
            float* phost_r = input_data_host + image_area * 2;
            for(int i = 0; i < image_area; ++i, pimage += 3){
                // note: the R and B channels are swapped here (OpenCV BGR -> RGB planes)
                *phost_r++ = pimage[0] / 255.0f;
                *phost_g++ = pimage[1] / 255.0f;
                *phost_b++ = pimage[2] / 255.0f;
            }
            checkRuntime(cudaMemcpyAsync(input_data_device, input_data_host, input_numel * sizeof(float), cudaMemcpyHostToDevice, stream));

            int output_batch = input_batch;
            int output_channel = input_channel;
            int output_height = input_height * scale_factor;
            int output_width = input_width * scale_factor;
            int output_numel = output_batch *  output_channel * output_height * output_width;
            float* output_data_host = nullptr;
            float* output_data_device = nullptr;  
            checkRuntime(cudaMallocHost(&output_data_host, output_numel * sizeof(float)));    
            checkRuntime(cudaMalloc(&output_data_device, output_numel * sizeof(float)));

            // determine the input dimensions used for this inference
            auto input_dims = execution_context->getBindingDimensions(0);
            input_dims.d[2] = input_height;
            input_dims.d[3] = input_width;
            // for(int i = 0; i < 4; ++i){
            //     printf("dim %d: %d\n", i, input_dims.d[i]);
            // }
            // set the input size for this inference
            execution_context->setBindingDimensions(0, input_dims);
            float* bindings[] = {input_data_device, output_data_device};

            bool success = execution_context->enqueueV2((void**)bindings, stream, nullptr);    // TODO: check whether the inference succeeded

            checkRuntime(cudaMemcpyAsync(output_data_host, output_data_device, output_numel * sizeof(float), cudaMemcpyDeviceToHost, stream));
            // wait for the enqueued inference and the device-to-host copy to finish
            // before stopping the timer and reading the host buffer
            checkRuntime(cudaStreamSynchronize(stream));
            auto pre_end_time = std::chrono::high_resolution_clock::now();
            auto pre_duration = std::chrono::duration_cast<std::chrono::milliseconds>(pre_end_time - pre_start_time);
            printf("Inference time: %ld ms\n", pre_duration.count());
            // printf("Inference succeeded!\n");
            auto post_start_time = std::chrono::high_resolution_clock::now();
            unsigned char* output_data_uchar = new unsigned char[output_numel];
            for (int i = 0; i < output_numel; ++i){
                output_data_uchar[i] = static_cast<uint8_t>(output_data_host[i]);
            }
            cv::Mat output_image(output_height, output_width, CV_8UC3, output_data_uchar);
            std::vector<uint8_t> output_image_bytes;
            cv::imencode(image_type, output_image, output_image_bytes);
            // free the buffer only after imencode, since cv::Mat does not copy the data
            delete [] output_data_uchar;
            auto post_end_time = std::chrono::high_resolution_clock::now();
            auto post_duration = std::chrono::duration_cast<std::chrono::milliseconds>(post_end_time - post_start_time);
            printf("Post-processing time: %ld ms\n", post_duration.count());

Please include:

  • Exact steps/commands to build your repro
  • Exact steps/commands to run your repro
  • Full traceback of errors encountered


Please share the model, script, profiler, and performance output, if not shared already, so that we can help you better.

Alternatively, you can try running your model with the trtexec command.
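
For example, a hypothetical invocation (the file names, input tensor name, and shape ranges are placeholders):

trtexec --onnx=realesrgan.onnx --fp16 \
        --minShapes=input:1x3x100x100 \
        --optShapes=input:1x3x400x400 \
        --maxShapes=input:1x3x400x400 \
        --saveEngine=realesrgan.plan

trtexec reports the GPU compute time separately from host-side overhead, which makes it easier to see how the network itself scales with resolution.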

While measuring the model performance, make sure you consider the latency and throughput of the network inference, excluding the data pre- and post-processing overhead.
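
For example, a minimal sketch (assuming execution_context, bindings, and stream are set up as in your fragment) that times only the enqueued network execution with CUDA events, after a few warm-up runs:

// Sketch: measure only the network execution, excluding pre- and post-processing.
for (int i = 0; i < 10; ++i)        // warm-up iterations
    execution_context->enqueueV2((void**)bindings, stream, nullptr);
checkRuntime(cudaStreamSynchronize(stream));

cudaEvent_t start, stop;
checkRuntime(cudaEventCreate(&start));
checkRuntime(cudaEventCreate(&stop));
checkRuntime(cudaEventRecord(start, stream));
for (int i = 0; i < 100; ++i)       // timed iterations
    execution_context->enqueueV2((void**)bindings, stream, nullptr);
checkRuntime(cudaEventRecord(stop, stream));
checkRuntime(cudaEventSynchronize(stop));
float ms = 0.0f;
checkRuntime(cudaEventElapsedTime(&ms, start, stop));
printf("mean GPU latency: %.3f ms\n", ms / 100.0f);
checkRuntime(cudaEventDestroy(start));
checkRuntime(cudaEventDestroy(stop));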
Please refer to the below links for more details: